cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "strings"
5 "testing"
6
7 "golang.org/x/net/html"
8)
9
10func TestHeuristicExtractor(t *testing.T) {
11 t.Run("NewHeuristicExtractor", func(t *testing.T) {
12 t.Run("creates extractor with scorer", func(t *testing.T) {
13 extractor := NewHeuristicExtractor()
14
15 if extractor == nil {
16 t.Fatal("Expected extractor to be created, got nil")
17 }
18
19 if extractor.scorer == nil {
20 t.Error("Expected extractor to have scorer")
21 }
22 })
23 })
24
25 t.Run("ExtractContent", func(t *testing.T) {
26 extractor := NewHeuristicExtractor()
27
28 t.Run("extracts content from article", func(t *testing.T) {
29 htmlStr := `<html><body>
30 <article class="main-content">
31 <p>This is the first paragraph of the article with substantial content.</p>
32 <p>This is the second paragraph with more information and details.</p>
33 <p>And this is the third paragraph to ensure sufficient content.</p>
34 </article>
35 <aside class="sidebar"><a href="#">Sidebar link</a></aside>
36 </body></html>`
37 doc := parseHTML(htmlStr)
38
39 result := extractor.ExtractContent(doc)
40
41 if result == nil {
42 t.Fatal("Expected extraction result, got nil")
43 }
44
45 if result.Content == "" {
46 t.Error("Expected content to be extracted")
47 }
48
49 if result.Confidence == 0.0 {
50 t.Error("Expected non-zero confidence")
51 }
52
53 if !strings.Contains(result.Content, "first paragraph") {
54 t.Error("Expected content to contain article text")
55 }
56 })
57
58 t.Run("returns low confidence for unreadable document", func(t *testing.T) {
59 htmlStr := `<html><body><div>Short</div></body></html>`
60 doc := parseHTML(htmlStr)
61
62 result := extractor.ExtractContent(doc)
63
64 if result == nil {
65 t.Fatal("Expected extraction result, got nil")
66 }
67
68 if result.Confidence > 0.3 {
69 t.Errorf("Expected low confidence for short document, got %f", result.Confidence)
70 }
71 })
72
73 t.Run("returns nil for nil document", func(t *testing.T) {
74 result := extractor.ExtractContent(nil)
75
76 if result != nil {
77 t.Error("Expected nil for nil document")
78 }
79 })
80 })
81
82 t.Run("cleanDocument", func(t *testing.T) {
83 extractor := NewHeuristicExtractor()
84
85 t.Run("removes script and style tags", func(t *testing.T) {
86 htmlStr := `<html><body>
87 <script>alert('test');</script>
88 <style>.test { color: red; }</style>
89 <p>Content</p>
90 </body></html>`
91 doc := parseHTML(htmlStr)
92
93 cleaned := extractor.cleanDocument(doc)
94
95 script := findElement(cleaned, "script")
96 style := findElement(cleaned, "style")
97
98 if script != nil {
99 t.Error("Expected script tag to be removed")
100 }
101
102 if style != nil {
103 t.Error("Expected style tag to be removed")
104 }
105 })
106
107 t.Run("removes hidden elements", func(t *testing.T) {
108 htmlStr := `<html><body>
109 <div style="display:none">Hidden</div>
110 <div hidden>Also hidden</div>
111 <p>Visible</p>
112 </body></html>`
113 doc := parseHTML(htmlStr)
114
115 cleaned := extractor.cleanDocument(doc)
116
117 // Count divs - should only have visible ones
118 divCount := 0
119 var countDivs func(*html.Node)
120 countDivs = func(node *html.Node) {
121 if node.Type == html.ElementNode && node.Data == "div" {
122 divCount++
123 }
124 for child := node.FirstChild; child != nil; child = child.NextSibling {
125 countDivs(child)
126 }
127 }
128 countDivs(cleaned)
129
130 if divCount > 0 {
131 t.Errorf("Expected hidden divs to be removed, found %d", divCount)
132 }
133 })
134
135 t.Run("removes high link density elements", func(t *testing.T) {
136 htmlStr := `<html><body>
137 <div class="links">
138 <a href="#">Link1</a>
139 <a href="#">Link2</a>
140 <a href="#">Link3</a>
141 </div>
142 <p>Regular paragraph with actual content that should remain.</p>
143 </body></html>`
144 doc := parseHTML(htmlStr)
145
146 cleaned := extractor.cleanDocument(doc)
147
148 p := findElement(cleaned, "p")
149 if p == nil {
150 t.Error("Expected paragraph to remain")
151 }
152 })
153 })
154
155 t.Run("extractTextContent", func(t *testing.T) {
156 extractor := NewHeuristicExtractor()
157
158 t.Run("extracts text with basic formatting", func(t *testing.T) {
159 htmlStr := `<html><body><div>
160 <p>First paragraph</p>
161 <p>Second paragraph</p>
162 </div></body></html>`
163 doc := parseHTML(htmlStr)
164 div := findElement(doc, "div")
165
166 text := extractor.extractTextContent(div)
167
168 if !strings.Contains(text, "First paragraph") {
169 t.Error("Expected text to contain first paragraph")
170 }
171
172 if !strings.Contains(text, "Second paragraph") {
173 t.Error("Expected text to contain second paragraph")
174 }
175 })
176
177 t.Run("formats list items with bullets", func(t *testing.T) {
178 htmlStr := `<html><body><ul>
179 <li>Item 1</li>
180 <li>Item 2</li>
181 </ul></body></html>`
182 doc := parseHTML(htmlStr)
183 ul := findElement(doc, "ul")
184
185 text := extractor.extractTextContent(ul)
186
187 if !strings.Contains(text, "•") {
188 t.Error("Expected text to contain bullet points")
189 }
190 })
191
192 t.Run("returns empty string for nil node", func(t *testing.T) {
193 text := extractor.extractTextContent(nil)
194
195 if text != "" {
196 t.Error("Expected empty string for nil node")
197 }
198 })
199 })
200
201 t.Run("CompareWithXPath", func(t *testing.T) {
202 extractor := NewHeuristicExtractor()
203
204 t.Run("high confidence when XPath and heuristics agree", func(t *testing.T) {
205 htmlStr := `<html><body>
206 <article>
207 <p>This is substantial content that both methods should find.</p>
208 <p>Another paragraph with more details and information.</p>
209 <p>And a third paragraph for good measure and completeness.</p>
210 </article>
211 </body></html>`
212 doc := parseHTML(htmlStr)
213 article := findElement(doc, "article")
214
215 result := extractor.CompareWithXPath(doc, article)
216
217 if result == nil {
218 t.Fatal("Expected result, got nil")
219 }
220
221 if result.Confidence < 0.8 {
222 t.Errorf("Expected high confidence when methods agree, got %f", result.Confidence)
223 }
224
225 if !strings.Contains(result.ExtractionMethod, "dual") && !strings.Contains(result.ExtractionMethod, "validated") {
226 t.Errorf("Expected dual validation method, got %s", result.ExtractionMethod)
227 }
228 })
229
230 t.Run("prefers XPath when it extracts more content", func(t *testing.T) {
231 htmlStr := `<html><body>
232 <div class="content">
233 <p>Short content</p>
234 </div>
235 <div class="more">
236 <p>This is additional content that XPath found but heuristics might miss.</p>
237 <p>Even more content here to make a significant difference in length.</p>
238 <p>And yet another paragraph to ensure XPath extraction is substantially longer.</p>
239 </div>
240 </body></html>`
241 doc := parseHTML(htmlStr)
242
243 // XPath would get more content
244 body := findElement(doc, "body")
245
246 result := extractor.CompareWithXPath(doc, body)
247
248 if result == nil {
249 t.Fatal("Expected result, got nil")
250 }
251
252 // Should prefer one method over the other
253 if result.ExtractionMethod == "heuristic" {
254 t.Errorf("Expected method preference, got %s", result.ExtractionMethod)
255 }
256 })
257
258 t.Run("uses heuristics when XPath node is nil", func(t *testing.T) {
259 htmlStr := `<html><body>
260 <article>
261 <p>Content that heuristics should find on its own.</p>
262 <p>Additional paragraph for sufficient content length.</p>
263 <p>Third paragraph to meet minimum requirements.</p>
264 </article>
265 </body></html>`
266 doc := parseHTML(htmlStr)
267
268 result := extractor.CompareWithXPath(doc, nil)
269
270 if result == nil {
271 t.Fatal("Expected result, got nil")
272 }
273
274 if result.ExtractionMethod != "heuristic" {
275 t.Errorf("Expected heuristic method when XPath is nil, got %s", result.ExtractionMethod)
276 }
277 })
278
279 t.Run("returns nil for nil document", func(t *testing.T) {
280 result := extractor.CompareWithXPath(nil, nil)
281
282 if result != nil {
283 t.Error("Expected nil for nil document")
284 }
285 })
286 })
287
288 t.Run("calculateSimilarity", func(t *testing.T) {
289 extractor := NewHeuristicExtractor()
290
291 t.Run("returns high similarity for identical text", func(t *testing.T) {
292 text := "This is some test content"
293
294 similarity := extractor.calculateSimilarity(text, text)
295
296 if similarity < 0.9 {
297 t.Errorf("Expected high similarity for identical text, got %f", similarity)
298 }
299 })
300
301 t.Run("returns low similarity for different text", func(t *testing.T) {
302 text1 := "This is the first piece of content"
303 text2 := "Completely different words and phrases"
304
305 similarity := extractor.calculateSimilarity(text1, text2)
306
307 if similarity > 0.3 {
308 t.Errorf("Expected low similarity for different text, got %f", similarity)
309 }
310 })
311
312 t.Run("returns zero for empty strings", func(t *testing.T) {
313 similarity := extractor.calculateSimilarity("text", "")
314
315 if similarity != 0.0 {
316 t.Errorf("Expected zero similarity for empty string, got %f", similarity)
317 }
318 })
319
320 t.Run("returns one for both empty", func(t *testing.T) {
321 similarity := extractor.calculateSimilarity("", "")
322
323 if similarity != 1.0 {
324 t.Errorf("Expected 1.0 similarity for both empty, got %f", similarity)
325 }
326 })
327 })
328
329 t.Run("ExtractWithSemanticHTML", func(t *testing.T) {
330 extractor := NewHeuristicExtractor()
331
332 t.Run("extracts from article tag", func(t *testing.T) {
333 htmlStr := `<html><body>
334 <nav>Navigation</nav>
335 <article>
336 <p>This is the main article content that should be extracted.</p>
337 <p>Second paragraph of the article with more information.</p>
338 <p>Third paragraph to provide sufficient content length.</p>
339 </article>
340 <aside>Sidebar</aside>
341 </body></html>`
342 doc := parseHTML(htmlStr)
343
344 result := extractor.ExtractWithSemanticHTML(doc)
345
346 if result == nil {
347 t.Fatal("Expected result, got nil")
348 }
349
350 if result.ExtractionMethod != "semantic-html" {
351 t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
352 }
353
354 if !strings.Contains(result.Content, "main article content") {
355 t.Error("Expected content from article tag")
356 }
357
358 if result.Confidence < 0.85 {
359 t.Errorf("Expected high confidence for semantic HTML, got %f", result.Confidence)
360 }
361 })
362
363 t.Run("extracts from main tag", func(t *testing.T) {
364 htmlStr := `<html><body>
365 <header>Header</header>
366 <main>
367 <p>This is the main content area with sufficient text.</p>
368 <p>Additional content paragraph with more details.</p>
369 <p>Third paragraph for completeness and length.</p>
370 </main>
371 <footer>Footer</footer>
372 </body></html>`
373 doc := parseHTML(htmlStr)
374
375 result := extractor.ExtractWithSemanticHTML(doc)
376
377 if result == nil {
378 t.Fatal("Expected result, got nil")
379 }
380
381 if result.ExtractionMethod != "semantic-html" {
382 t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
383 }
384
385 if !strings.Contains(result.Content, "main content area") {
386 t.Error("Expected content from main tag")
387 }
388 })
389
390 t.Run("falls back to heuristics without semantic tags", func(t *testing.T) {
391 htmlStr := `<html><body>
392 <div class="content">
393 <p>Content in a regular div without semantic HTML tags.</p>
394 <p>Second paragraph with additional information.</p>
395 <p>Third paragraph for sufficient content.</p>
396 </div>
397 </body></html>`
398 doc := parseHTML(htmlStr)
399
400 result := extractor.ExtractWithSemanticHTML(doc)
401
402 if result == nil {
403 t.Fatal("Expected result, got nil")
404 }
405
406 if result.ExtractionMethod == "semantic-html" {
407 t.Error("Should not use semantic-html method without semantic tags")
408 }
409 })
410
411 t.Run("returns nil for nil document", func(t *testing.T) {
412 result := extractor.ExtractWithSemanticHTML(nil)
413
414 if result != nil {
415 t.Error("Expected nil for nil document")
416 }
417 })
418 })
419
420 t.Run("isBlockElement", func(t *testing.T) {
421 extractor := NewHeuristicExtractor()
422
423 t.Run("identifies block elements", func(t *testing.T) {
424 blockTags := []string{"p", "div", "article", "h1", "section"}
425
426 for _, tag := range blockTags {
427 if !extractor.isBlockElement(tag) {
428 t.Errorf("Expected %s to be a block element", tag)
429 }
430 }
431 })
432
433 t.Run("identifies non-block elements", func(t *testing.T) {
434 inlineTags := []string{"span", "a", "em", "strong", "code"}
435
436 for _, tag := range inlineTags {
437 if extractor.isBlockElement(tag) {
438 t.Errorf("Expected %s to not be a block element", tag)
439 }
440 }
441 })
442 })
443}