internal/articles/heuristics_test.go at main · desertthunder.dev/noteleaf

cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
noteleaf / internal / articles / heuristics_test.go
at main 443 lines 12 kB view raw
  1package articles
  2
  3import (
  4	"strings"
  5	"testing"
  6
  7	"golang.org/x/net/html"
  8)
  9
 10func TestHeuristicExtractor(t *testing.T) {
 11	t.Run("NewHeuristicExtractor", func(t *testing.T) {
 12		t.Run("creates extractor with scorer", func(t *testing.T) {
 13			extractor := NewHeuristicExtractor()
 14
 15			if extractor == nil {
 16				t.Fatal("Expected extractor to be created, got nil")
 17			}
 18
 19			if extractor.scorer == nil {
 20				t.Error("Expected extractor to have scorer")
 21			}
 22		})
 23	})
 24
 25	t.Run("ExtractContent", func(t *testing.T) {
 26		extractor := NewHeuristicExtractor()
 27
 28		t.Run("extracts content from article", func(t *testing.T) {
 29			htmlStr := `<html><body>
 30				<article class="main-content">
 31					<p>This is the first paragraph of the article with substantial content.</p>
 32					<p>This is the second paragraph with more information and details.</p>
 33					<p>And this is the third paragraph to ensure sufficient content.</p>
 34				</article>
 35				<aside class="sidebar"><a href="#">Sidebar link</a></aside>
 36			</body></html>`
 37			doc := parseHTML(htmlStr)
 38
 39			result := extractor.ExtractContent(doc)
 40
 41			if result == nil {
 42				t.Fatal("Expected extraction result, got nil")
 43			}
 44
 45			if result.Content == "" {
 46				t.Error("Expected content to be extracted")
 47			}
 48
 49			if result.Confidence == 0.0 {
 50				t.Error("Expected non-zero confidence")
 51			}
 52
 53			if !strings.Contains(result.Content, "first paragraph") {
 54				t.Error("Expected content to contain article text")
 55			}
 56		})
 57
 58		t.Run("returns low confidence for unreadable document", func(t *testing.T) {
 59			htmlStr := `<html><body><div>Short</div></body></html>`
 60			doc := parseHTML(htmlStr)
 61
 62			result := extractor.ExtractContent(doc)
 63
 64			if result == nil {
 65				t.Fatal("Expected extraction result, got nil")
 66			}
 67
 68			if result.Confidence > 0.3 {
 69				t.Errorf("Expected low confidence for short document, got %f", result.Confidence)
 70			}
 71		})
 72
 73		t.Run("returns nil for nil document", func(t *testing.T) {
 74			result := extractor.ExtractContent(nil)
 75
 76			if result != nil {
 77				t.Error("Expected nil for nil document")
 78			}
 79		})
 80	})
 81
 82	t.Run("cleanDocument", func(t *testing.T) {
 83		extractor := NewHeuristicExtractor()
 84
 85		t.Run("removes script and style tags", func(t *testing.T) {
 86			htmlStr := `<html><body>
 87				<script>alert('test');</script>
 88				<style>.test { color: red; }</style>
 89				<p>Content</p>
 90			</body></html>`
 91			doc := parseHTML(htmlStr)
 92
 93			cleaned := extractor.cleanDocument(doc)
 94
 95			script := findElement(cleaned, "script")
 96			style := findElement(cleaned, "style")
 97
 98			if script != nil {
 99				t.Error("Expected script tag to be removed")
100			}
101
102			if style != nil {
103				t.Error("Expected style tag to be removed")
104			}
105		})
106
107		t.Run("removes hidden elements", func(t *testing.T) {
108			htmlStr := `<html><body>
109				<div style="display:none">Hidden</div>
110				<div hidden>Also hidden</div>
111				<p>Visible</p>
112			</body></html>`
113			doc := parseHTML(htmlStr)
114
115			cleaned := extractor.cleanDocument(doc)
116
117			// Count divs - should only have visible ones
118			divCount := 0
119			var countDivs func(*html.Node)
120			countDivs = func(node *html.Node) {
121				if node.Type == html.ElementNode && node.Data == "div" {
122					divCount++
123				}
124				for child := node.FirstChild; child != nil; child = child.NextSibling {
125					countDivs(child)
126				}
127			}
128			countDivs(cleaned)
129
130			if divCount > 0 {
131				t.Errorf("Expected hidden divs to be removed, found %d", divCount)
132			}
133		})
134
135		t.Run("removes high link density elements", func(t *testing.T) {
136			htmlStr := `<html><body>
137				<div class="links">
138					<a href="#">Link1</a>
139					<a href="#">Link2</a>
140					<a href="#">Link3</a>
141				</div>
142				<p>Regular paragraph with actual content that should remain.</p>
143			</body></html>`
144			doc := parseHTML(htmlStr)
145
146			cleaned := extractor.cleanDocument(doc)
147
148			p := findElement(cleaned, "p")
149			if p == nil {
150				t.Error("Expected paragraph to remain")
151			}
152		})
153	})
154
155	t.Run("extractTextContent", func(t *testing.T) {
156		extractor := NewHeuristicExtractor()
157
158		t.Run("extracts text with basic formatting", func(t *testing.T) {
159			htmlStr := `<html><body><div>
160				<p>First paragraph</p>
161				<p>Second paragraph</p>
162			</div></body></html>`
163			doc := parseHTML(htmlStr)
164			div := findElement(doc, "div")
165
166			text := extractor.extractTextContent(div)
167
168			if !strings.Contains(text, "First paragraph") {
169				t.Error("Expected text to contain first paragraph")
170			}
171
172			if !strings.Contains(text, "Second paragraph") {
173				t.Error("Expected text to contain second paragraph")
174			}
175		})
176
177		t.Run("formats list items with bullets", func(t *testing.T) {
178			htmlStr := `<html><body><ul>
179				<li>Item 1</li>
180				<li>Item 2</li>
181			</ul></body></html>`
182			doc := parseHTML(htmlStr)
183			ul := findElement(doc, "ul")
184
185			text := extractor.extractTextContent(ul)
186
187			if !strings.Contains(text, "•") {
188				t.Error("Expected text to contain bullet points")
189			}
190		})
191
192		t.Run("returns empty string for nil node", func(t *testing.T) {
193			text := extractor.extractTextContent(nil)
194
195			if text != "" {
196				t.Error("Expected empty string for nil node")
197			}
198		})
199	})
200
201	t.Run("CompareWithXPath", func(t *testing.T) {
202		extractor := NewHeuristicExtractor()
203
204		t.Run("high confidence when XPath and heuristics agree", func(t *testing.T) {
205			htmlStr := `<html><body>
206				<article>
207					<p>This is substantial content that both methods should find.</p>
208					<p>Another paragraph with more details and information.</p>
209					<p>And a third paragraph for good measure and completeness.</p>
210				</article>
211			</body></html>`
212			doc := parseHTML(htmlStr)
213			article := findElement(doc, "article")
214
215			result := extractor.CompareWithXPath(doc, article)
216
217			if result == nil {
218				t.Fatal("Expected result, got nil")
219			}
220
221			if result.Confidence < 0.8 {
222				t.Errorf("Expected high confidence when methods agree, got %f", result.Confidence)
223			}
224
225			if !strings.Contains(result.ExtractionMethod, "dual") && !strings.Contains(result.ExtractionMethod, "validated") {
226				t.Errorf("Expected dual validation method, got %s", result.ExtractionMethod)
227			}
228		})
229
230		t.Run("prefers XPath when it extracts more content", func(t *testing.T) {
231			htmlStr := `<html><body>
232				<div class="content">
233					<p>Short content</p>
234				</div>
235				<div class="more">
236					<p>This is additional content that XPath found but heuristics might miss.</p>
237					<p>Even more content here to make a significant difference in length.</p>
238					<p>And yet another paragraph to ensure XPath extraction is substantially longer.</p>
239				</div>
240			</body></html>`
241			doc := parseHTML(htmlStr)
242
243			// XPath would get more content
244			body := findElement(doc, "body")
245
246			result := extractor.CompareWithXPath(doc, body)
247
248			if result == nil {
249				t.Fatal("Expected result, got nil")
250			}
251
252			// Should prefer one method over the other
253			if result.ExtractionMethod == "heuristic" {
254				t.Errorf("Expected method preference, got %s", result.ExtractionMethod)
255			}
256		})
257
258		t.Run("uses heuristics when XPath node is nil", func(t *testing.T) {
259			htmlStr := `<html><body>
260				<article>
261					<p>Content that heuristics should find on its own.</p>
262					<p>Additional paragraph for sufficient content length.</p>
263					<p>Third paragraph to meet minimum requirements.</p>
264				</article>
265			</body></html>`
266			doc := parseHTML(htmlStr)
267
268			result := extractor.CompareWithXPath(doc, nil)
269
270			if result == nil {
271				t.Fatal("Expected result, got nil")
272			}
273
274			if result.ExtractionMethod != "heuristic" {
275				t.Errorf("Expected heuristic method when XPath is nil, got %s", result.ExtractionMethod)
276			}
277		})
278
279		t.Run("returns nil for nil document", func(t *testing.T) {
280			result := extractor.CompareWithXPath(nil, nil)
281
282			if result != nil {
283				t.Error("Expected nil for nil document")
284			}
285		})
286	})
287
288	t.Run("calculateSimilarity", func(t *testing.T) {
289		extractor := NewHeuristicExtractor()
290
291		t.Run("returns high similarity for identical text", func(t *testing.T) {
292			text := "This is some test content"
293
294			similarity := extractor.calculateSimilarity(text, text)
295
296			if similarity < 0.9 {
297				t.Errorf("Expected high similarity for identical text, got %f", similarity)
298			}
299		})
300
301		t.Run("returns low similarity for different text", func(t *testing.T) {
302			text1 := "This is the first piece of content"
303			text2 := "Completely different words and phrases"
304
305			similarity := extractor.calculateSimilarity(text1, text2)
306
307			if similarity > 0.3 {
308				t.Errorf("Expected low similarity for different text, got %f", similarity)
309			}
310		})
311
312		t.Run("returns zero for empty strings", func(t *testing.T) {
313			similarity := extractor.calculateSimilarity("text", "")
314
315			if similarity != 0.0 {
316				t.Errorf("Expected zero similarity for empty string, got %f", similarity)
317			}
318		})
319
320		t.Run("returns one for both empty", func(t *testing.T) {
321			similarity := extractor.calculateSimilarity("", "")
322
323			if similarity != 1.0 {
324				t.Errorf("Expected 1.0 similarity for both empty, got %f", similarity)
325			}
326		})
327	})
328
329	t.Run("ExtractWithSemanticHTML", func(t *testing.T) {
330		extractor := NewHeuristicExtractor()
331
332		t.Run("extracts from article tag", func(t *testing.T) {
333			htmlStr := `<html><body>
334				<nav>Navigation</nav>
335				<article>
336					<p>This is the main article content that should be extracted.</p>
337					<p>Second paragraph of the article with more information.</p>
338					<p>Third paragraph to provide sufficient content length.</p>
339				</article>
340				<aside>Sidebar</aside>
341			</body></html>`
342			doc := parseHTML(htmlStr)
343
344			result := extractor.ExtractWithSemanticHTML(doc)
345
346			if result == nil {
347				t.Fatal("Expected result, got nil")
348			}
349
350			if result.ExtractionMethod != "semantic-html" {
351				t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
352			}
353
354			if !strings.Contains(result.Content, "main article content") {
355				t.Error("Expected content from article tag")
356			}
357
358			if result.Confidence < 0.85 {
359				t.Errorf("Expected high confidence for semantic HTML, got %f", result.Confidence)
360			}
361		})
362
363		t.Run("extracts from main tag", func(t *testing.T) {
364			htmlStr := `<html><body>
365				<header>Header</header>
366				<main>
367					<p>This is the main content area with sufficient text.</p>
368					<p>Additional content paragraph with more details.</p>
369					<p>Third paragraph for completeness and length.</p>
370				</main>
371				<footer>Footer</footer>
372			</body></html>`
373			doc := parseHTML(htmlStr)
374
375			result := extractor.ExtractWithSemanticHTML(doc)
376
377			if result == nil {
378				t.Fatal("Expected result, got nil")
379			}
380
381			if result.ExtractionMethod != "semantic-html" {
382				t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
383			}
384
385			if !strings.Contains(result.Content, "main content area") {
386				t.Error("Expected content from main tag")
387			}
388		})
389
390		t.Run("falls back to heuristics without semantic tags", func(t *testing.T) {
391			htmlStr := `<html><body>
392				<div class="content">
393					<p>Content in a regular div without semantic HTML tags.</p>
394					<p>Second paragraph with additional information.</p>
395					<p>Third paragraph for sufficient content.</p>
396				</div>
397			</body></html>`
398			doc := parseHTML(htmlStr)
399
400			result := extractor.ExtractWithSemanticHTML(doc)
401
402			if result == nil {
403				t.Fatal("Expected result, got nil")
404			}
405
406			if result.ExtractionMethod == "semantic-html" {
407				t.Error("Should not use semantic-html method without semantic tags")
408			}
409		})
410
411		t.Run("returns nil for nil document", func(t *testing.T) {
412			result := extractor.ExtractWithSemanticHTML(nil)
413
414			if result != nil {
415				t.Error("Expected nil for nil document")
416			}
417		})
418	})
419
420	t.Run("isBlockElement", func(t *testing.T) {
421		extractor := NewHeuristicExtractor()
422
423		t.Run("identifies block elements", func(t *testing.T) {
424			blockTags := []string{"p", "div", "article", "h1", "section"}
425
426			for _, tag := range blockTags {
427				if !extractor.isBlockElement(tag) {
428					t.Errorf("Expected %s to be a block element", tag)
429				}
430			}
431		})
432
433		t.Run("identifies non-block elements", func(t *testing.T) {
434			inlineTags := []string{"span", "a", "em", "strong", "code"}
435
436			for _, tag := range inlineTags {
437				if extractor.isBlockElement(tag) {
438					t.Errorf("Expected %s to not be a block element", tag)
439				}
440			}
441		})
442	})
443}