cli + tui to publish to leaflet (wip) & manage tasks, notes, and watch/read lists 🍈
package articles

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/stormlightlabs/noteleaf/internal/models"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")

		for i, line := range lines {
			if i >= 4 {
				break
			}

			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Author:** Contributors to Wikimedia projects
}

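// TestArticleParser exercises parser construction, rule-file parsing, slug
// generation, HTML-to-markdown conversion, URL fetching, and article saving.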
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		tc := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tt := range tc {
			t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
				result := parser.slugify(tt.input)
				if result != tt.expected {
					t.Errorf("Expected '%s', got '%s'", tt.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			// Fatal rather than Error: err.Error() below would panic on a nil error.
			if err == nil {
				t.Fatal("Expected error for unsupported domain")
			}

			if !strings.Contains(err.Error(), "confidence too low") &&
				!strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected heuristic extraction error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Fatal("Expected error when no title can be extracted")
			}

			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "could not extract body content") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title, body, or confidence error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Article Title</h1>
					<div id="bodyContent">
						<style>.mw-parser-output .hatnote{font-style:italic;}</style>
						<p>This is the main content of the article.</p>
						<div class="noprint">This should be stripped</div>
						<div class="editsection">Edit this section</div>
						<p>More content here.</p>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
			if strings.Contains(markdown, ".mw-parser-output") {
				t.Error("Expected style content to be removed from markdown")
			}
			if strings.Contains(markdown, "Edit this section") {
				t.Error("Expected edit section markers to be removed from markdown")
			}
		})

		t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Navigation Article</title></head>
				<body>
					<h1 id="firstHeading">Test Navigation Article</h1>
					<div id="bodyContent">
						<p>Main article content goes here.</p>
						<h2>Section One<span class="mw-editsection">[edit]</span></h2>
						<p>Section content.</p>
						<table class="navbox" role="navigation">
							<tr><td>Navigation item 1</td></tr>
							<tr><td>Navigation item 2</td></tr>
						</table>
						<div class="navbox">
							<p>Another navigation box</p>
						</div>
						<table class="vertical-navbox">
							<tr><td>Vertical nav item</td></tr>
						</table>
						<p>More article content.</p>
						<div role="navigation">
							<p>Navigation content</p>
						</div>
						<div id="catlinks">
							<p>Categories: Test Category</p>
						</div>
						<div id="footer">
							<p>Retrieved from Wikipedia</p>
						</div>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "Main article content") {
				t.Error("Expected markdown to contain main article content")
			}
			if !strings.Contains(markdown, "Section content") {
				t.Error("Expected markdown to contain section content")
			}
			if !strings.Contains(markdown, "More article content") {
				t.Error("Expected markdown to contain additional content")
			}

			if strings.Contains(markdown, "Navigation item") {
				t.Error("Expected navbox table content to be stripped")
			}
			if strings.Contains(markdown, "Another navigation box") {
				t.Error("Expected navbox div content to be stripped")
			}
			if strings.Contains(markdown, "Vertical nav item") {
				t.Error("Expected vertical-navbox content to be stripped")
			}
			if strings.Contains(markdown, "[edit]") {
				t.Error("Expected edit section markers to be stripped")
			}
			if strings.Contains(markdown, "Navigation content") {
				t.Error("Expected role=navigation content to be stripped")
			}
			if strings.Contains(markdown, "Categories:") {
				t.Error("Expected category links to be stripped")
			}
			if strings.Contains(markdown, "Retrieved from") {
				t.Error("Expected footer content to be stripped")
			}
		})

		t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Heuristic Test Article</title>
				<meta property="og:author" content="Heuristic Author">
				<meta property="article:published_time" content="2025-01-15">
			</head><body>
				<article>
					<p>This is a substantial article that should be extracted using heuristic methods.</p>
					<p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p>
					<p>The heuristic extractor should successfully identify this as main content.</p>
				</article>
			</body></html>`

			markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article")

			if err == nil {
				if !strings.Contains(markdown, "substantial article") {
					t.Error("Expected markdown to contain extracted content")
				}
			}
		})

		t.Run("includes confidence score in parsed content", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Confidence Test</title></head>
				<body>
					<h1 id="firstHeading">Confidence Test Article</h1>
					<div id="bodyContent">
						<p>Article content for confidence testing.</p>
					</div>
				</body>
			</html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Confidence == 0.0 {
				t.Error("Expected non-zero confidence score")
			}

			if content.ExtractionMethod == "" {
				t.Error("Expected extraction method to be set")
			}
		})

		t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Metadata Fallback Test</title>
				<meta property="og:author" content="Metadata Author">
				<meta property="article:published_time" content="2025-01-20">
			</head><body>
				<h1 id="firstHeading">Fallback Test</h1>
				<div id="bodyContent">
					<p>Content without author or date in XPath locations.</p>
				</div>
			</body></html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Author != "Metadata Author" {
				t.Errorf("Expected metadata fallback for author, got %q", content.Author)
			}

			if content.Date != "2025-01-20" {
				t.Errorf("Expected metadata fallback for date, got %q", content.Date)
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		const (
			validURL       = "https://example.com/wiki/test"
			httpErrorURL   = "https://example.com/wiki/404"
			unsupportedURL = "https://unsupported-domain.test/article"
		)

		// Stub the HTTP transport so each known URL returns a canned response.
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			switch req.URL.String() {
			case validURL:
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test Article</title></head>
					<body>
						<h1 id="firstHeading">Test Wikipedia Article</h1>
						<div id="bodyContent">
							<p>This is the article content.</p>
							<div class="noprint">This gets stripped</div>
						</div>
					</body>
				</html>`), nil
			case httpErrorURL:
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			case unsupportedURL:
				return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil
			default:
				return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
			}
		}))

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Fatal("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") &&
				!strings.Contains(err.Error(), "failed to fetch URL") &&
				!strings.Contains(err.Error(), "invalid URL") {
				t.Errorf("Expected URL scheme error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(unsupportedURL)
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(httpErrorURL)
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

		t.Run("successfully parses supported domain", func(t *testing.T) {
			content, err := parser.ParseURL(validURL)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if content == nil {
				t.Fatal("Expected parsed content, got nil")
			}
			if content.Title != "Test Wikipedia Article" {
				t.Errorf("Expected title to be extracted, got %q", content.Title)
			}
			if !strings.Contains(content.Content, "This is the article content.") {
				t.Errorf("Expected content to include article text, got %q", content.Content)
			}
			if strings.Contains(content.Content, "This gets stripped") {
				t.Error("Expected strip rules to remove non-content nodes")
			}
		})
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}

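// TestCreateArticleFromURL covers end-to-end article creation: URL
// validation, HTTP and network failures, malformed HTML, and saving
// parsed content to disk.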
func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Fatal("Expected error for invalid URL")
		}
		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
			t.Errorf("Expected URL parsing error, got %v", err)
		}
	})

	t.Run("fails with empty URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("", tempDir)
		if err == nil {
			t.Error("Expected error for empty URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		unsupportedURL := "https://unsupported-domain.test/article"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == unsupportedURL {
				return htmlResponse(http.StatusOK, "<html><body><div>Too little content</div></body></html>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(unsupportedURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for unsupported domain")
		}
		if !strings.Contains(err.Error(), "confidence too low") &&
			!strings.Contains(err.Error(), "could not extract title") {
			t.Errorf("Expected heuristic extraction error, got %v", err)
		}
	})

	t.Run("fails with HTTP error", func(t *testing.T) {
		errorURL := "https://example.com/missing"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == errorURL {
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(errorURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for HTTP 404")
		}
		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
			t.Errorf("Expected HTTP error, got %v", err)
		}
	})

	t.Run("fails with network error", func(t *testing.T) {
		networkURL := "https://example.com/network"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == networkURL {
				return nil, errors.New("dial error")
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(networkURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for network failure")
		}
		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
			t.Errorf("Expected network error, got %v", err)
		}
	})

	t.Run("fails with malformed HTML", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		malformedURL := "https://example.com/malformed"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == malformedURL {
				return htmlResponse(http.StatusOK, "<html><head><title>Test</head></body>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		_, err = parser.ParseURL(malformedURL)
		if err == nil {
			t.Fatal("Expected error for malformed HTML")
		}

		if !strings.Contains(err.Error(), "failed to parse HTML") &&
			!strings.Contains(err.Error(), "could not extract title") &&
			!strings.Contains(err.Error(), "could not extract body content") &&
			!strings.Contains(err.Error(), "confidence too low") {
			t.Errorf("Expected HTML parsing or extraction error, got %v", err)
		}
	})

	t.Run("fails when no title can be extracted", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		noTitleURL := "https://example.com/notitle"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == noTitleURL {
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test</title></head>
					<body>
						<div id="bodyContent">
							<p>Content without proper title</p>
						</div>
					</body>
				</html>`), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		result, err := parser.ParseURL(noTitleURL)

		if err != nil {
			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title extraction error, got %v", err)
			}
		} else if result != nil {
			if result.Title == "" {
				t.Error("Expected title to be extracted via metadata fallback")
			}
		}
	})

	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
		wikipediaHTML := `<html>
			<head><title>Integration Test Article</title></head>
			<body>
				<h1 id="firstHeading">Integration Test Article</h1>
				<div id="bodyContent">
					<p>This is integration test content.</p>
					<div class="noprint">This should be stripped</div>
					<p>More content here.</p>
				</div>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/integration"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, wikipediaHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		article := &models.Article{
			URL:          contentURL,
			Title:        content.Title,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      time.Now(),
			Modified:     time.Now(),
		}

		if article.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
		}
		if article.URL != contentURL {
			t.Errorf("Expected URL %s, got %s", contentURL, article.URL)
		}
		if article.MarkdownPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if article.HTMLPath == "" {
			t.Error("Expected non-empty HTML path")
		}
		if article.Created.IsZero() {
			t.Error("Expected Created timestamp to be set")
		}
		if article.Modified.IsZero() {
			t.Error("Expected Modified timestamp to be set")
		}

		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}

		mdContent, err := os.ReadFile(article.MarkdownPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "# Integration Test Article") {
			t.Error("Expected markdown to contain title")
		}
		if !strings.Contains(string(mdContent), "This is integration test content") {
			t.Error("Expected markdown to contain article content")
		}
		if strings.Contains(string(mdContent), "This should be stripped") {
			t.Error("Expected stripped content to be removed from markdown")
		}

		htmlContent, err := os.ReadFile(article.HTMLPath)
		if err != nil {
			t.Fatalf("Failed to read HTML file: %v", err)
		}
		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
			t.Error("Expected HTML to contain title")
		}
		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
			t.Error("Expected HTML to contain DOCTYPE")
		}
	})

	t.Run("successfully handles article with metadata", func(t *testing.T) {
		contentHTML := `<html>
			<head>
				<title>Test Paper</title>
				<meta name="citation_author" content="Dr. Test Author">
				<meta name="citation_date" content="2024-01-01">
			</head>
			<body>
				<h1 class="title">Test Research Paper</h1>
				<blockquote class="abstract">
					<p>This is the abstract of the research paper.</p>
					<p>It contains important research findings.</p>
				</blockquote>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
			Date:   "//meta[@name='citation_date']/@content",
			Author: "//meta[@name='citation_author']/@content",
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/metadata"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, contentHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		if content.Title != "Test Research Paper" {
			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
		}
		if content.Author != "Dr. Test Author" {
			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
		}
		if content.Date != "2024-01-01" {
			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
		}

		mdPath, _, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		mdContent, err := os.ReadFile(mdPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
			t.Error("Expected markdown to contain author")
		}
		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
			t.Error("Expected markdown to contain date")
		}

		article := &models.Article{
			Author: content.Author,
			Date:   content.Date,
		}

		if article.Author != "Dr. Test Author" {
			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
		}
		if article.Date != "2024-01-01" {
			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
		}
	})
}

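// roundTripFunc adapts an ordinary function to the http.RoundTripper
// interface so tests can stub HTTP responses.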
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	return f(req)
}

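// newMockHTTPClient returns an *http.Client whose transport is the given
// stub function.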
func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client {
	t.Helper()
	return &http.Client{Transport: fn}
}

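// htmlResponse builds an *http.Response with the given status code and an
// HTML body.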
func htmlResponse(status int, body string) *http.Response {
	return &http.Response{
		StatusCode: status,
		Header:     http.Header{"Content-Type": []string{"text/html; charset=utf-8"}},
		Body:       io.NopCloser(strings.NewReader(body)),
	}
}

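// withDefaultHTTPClient swaps http.DefaultClient's transport for the stub
// and restores the original when the test finishes.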
func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) {
	t.Helper()
	original := http.DefaultClient.Transport
	http.DefaultClient.Transport = fn
	t.Cleanup(func() {
		http.DefaultClient.Transport = original
	})
}