cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
package articles

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/stormlightlabs/noteleaf/internal/models"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")

		for i, line := range lines {
			if i >= 4 {
				break
			}

			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Author:** Contributors to Wikimedia projects
}

func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		tc := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tt := range tc {
			t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
				result := parser.slugify(tt.input)
				if result != tt.expected {
					t.Errorf("Expected '%s', got '%s'", tt.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			if err == nil {
				t.Error("Expected error for unsupported domain")
			}

			if !strings.Contains(err.Error(), "confidence too low") &&
				!strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected heuristic extraction error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error when no title can be extracted")
			}

			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "could not extract body content") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title, body, or confidence error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Article Title</h1>
					<div id="bodyContent">
						<style>.mw-parser-output .hatnote{font-style:italic;}</style>
						<p>This is the main content of the article.</p>
						<div class="noprint">This should be stripped</div>
						<div class="editsection">Edit this section</div>
						<p>More content here.</p>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
			if strings.Contains(markdown, ".mw-parser-output") {
				t.Error("Expected style content to be removed from markdown")
			}
			if strings.Contains(markdown, "Edit this section") {
				t.Error("Expected edit section markers to be removed from markdown")
			}
		})

		t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Navigation Article</title></head>
				<body>
					<h1 id="firstHeading">Test Navigation Article</h1>
					<div id="bodyContent">
						<p>Main article content goes here.</p>
						<h2>Section One<span class="mw-editsection">[edit]</span></h2>
						<p>Section content.</p>
						<table class="navbox" role="navigation">
							<tr><td>Navigation item 1</td></tr>
							<tr><td>Navigation item 2</td></tr>
						</table>
						<div class="navbox">
							<p>Another navigation box</p>
						</div>
						<table class="vertical-navbox">
							<tr><td>Vertical nav item</td></tr>
						</table>
						<p>More article content.</p>
						<div role="navigation">
							<p>Navigation content</p>
						</div>
						<div id="catlinks">
							<p>Categories: Test Category</p>
						</div>
						<div id="footer">
							<p>Retrieved from Wikipedia</p>
						</div>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "Main article content") {
				t.Error("Expected markdown to contain main article content")
			}
			if !strings.Contains(markdown, "Section content") {
				t.Error("Expected markdown to contain section content")
			}
			if !strings.Contains(markdown, "More article content") {
				t.Error("Expected markdown to contain additional content")
			}

			if strings.Contains(markdown, "Navigation item") {
				t.Error("Expected navbox table content to be stripped")
			}
			if strings.Contains(markdown, "Another navigation box") {
				t.Error("Expected navbox div content to be stripped")
			}
			if strings.Contains(markdown, "Vertical nav item") {
				t.Error("Expected vertical-navbox content to be stripped")
			}
			if strings.Contains(markdown, "[edit]") {
				t.Error("Expected edit section markers to be stripped")
			}
			if strings.Contains(markdown, "Navigation content") {
				t.Error("Expected role=navigation content to be stripped")
			}
			if strings.Contains(markdown, "Categories:") {
				t.Error("Expected category links to be stripped")
			}
			if strings.Contains(markdown, "Retrieved from") {
				t.Error("Expected footer content to be stripped")
			}
		})

		t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Heuristic Test Article</title>
				<meta property="og:author" content="Heuristic Author">
				<meta property="article:published_time" content="2025-01-15">
			</head><body>
				<article>
					<p>This is a substantial article that should be extracted using heuristic methods.</p>
					<p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p>
					<p>The heuristic extractor should successfully identify this as main content.</p>
				</article>
			</body></html>`

			markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article")

			if err == nil {
				if !strings.Contains(markdown, "substantial article") {
					t.Error("Expected markdown to contain extracted content")
				}
			}
		})

		t.Run("includes confidence score in parsed content", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Confidence Test</title></head>
				<body>
					<h1 id="firstHeading">Confidence Test Article</h1>
					<div id="bodyContent">
						<p>Article content for confidence testing.</p>
					</div>
				</body>
			</html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Confidence == 0.0 {
				t.Error("Expected non-zero confidence score")
			}

			if content.ExtractionMethod == "" {
				t.Error("Expected extraction method to be set")
			}
		})

		t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Metadata Fallback Test</title>
				<meta property="og:author" content="Metadata Author">
				<meta property="article:published_time" content="2025-01-20">
			</head><body>
				<h1 id="firstHeading">Fallback Test</h1>
				<div id="bodyContent">
					<p>Content without author or date in XPath locations.</p>
				</div>
			</body></html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Author != "Metadata Author" {
				t.Errorf("Expected metadata fallback for author, got %q", content.Author)
			}

			if content.Date != "2025-01-20" {
				t.Errorf("Expected metadata fallback for date, got %q", content.Date)
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		const (
			validURL       = "https://example.com/wiki/test"
			httpErrorURL   = "https://example.com/wiki/404"
			unsupportedURL = "https://unsupported-domain.test/article"
		)

		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			switch req.URL.String() {
			case validURL:
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test Article</title></head>
					<body>
						<h1 id="firstHeading">Test Wikipedia Article</h1>
						<div id="bodyContent">
							<p>This is the article content.</p>
							<div class="noprint">This gets stripped</div>
						</div>
					</body>
				</html>`), nil
			case httpErrorURL:
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			case unsupportedURL:
				return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil
			default:
				return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
			}
		}))

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Error("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") &&
				!strings.Contains(err.Error(), "failed to fetch URL") &&
				!strings.Contains(err.Error(), "invalid URL") {
				t.Errorf("Expected URL scheme error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(unsupportedURL)
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(httpErrorURL)
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

		t.Run("successfully parses supported domain", func(t *testing.T) {
			content, err := parser.ParseURL(validURL)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if content == nil {
				t.Fatal("Expected parsed content, got nil")
			}
			if content.Title != "Test Wikipedia Article" {
				t.Errorf("Expected title to be extracted, got %q", content.Title)
			}
			if !strings.Contains(content.Content, "This is the article content.") {
				t.Errorf("Expected content to include article text, got %q", content.Content)
			}
			if strings.Contains(content.Content, "This gets stripped") {
				t.Error("Expected strip rules to remove non-content nodes")
			}
		})
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}

func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Error("Expected error for invalid URL")
		}
		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
			t.Errorf("Expected URL parsing error, got %v", err)
		}
	})

	t.Run("fails with empty URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("", tempDir)
		if err == nil {
			t.Error("Expected error for empty URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		unsupportedURL := "https://unsupported-domain.test/article"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == unsupportedURL {
				return htmlResponse(http.StatusOK, "<html><body><div>Too little content</div></body></html>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(unsupportedURL, tempDir)
		if err == nil {
			t.Error("Expected error for unsupported domain")
		}
		if !strings.Contains(err.Error(), "confidence too low") &&
			!strings.Contains(err.Error(), "could not extract title") {
			t.Errorf("Expected heuristic extraction error, got %v", err)
		}
	})

	t.Run("fails with HTTP error", func(t *testing.T) {
		errorURL := "https://example.com/missing"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == errorURL {
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(errorURL, tempDir)
		if err == nil {
			t.Error("Expected error for HTTP 404")
		}
		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
			t.Errorf("Expected HTTP error, got %v", err)
		}
	})

	t.Run("fails with network error", func(t *testing.T) {
		networkURL := "https://example.com/network"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == networkURL {
				return nil, errors.New("dial error")
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(networkURL, tempDir)
		if err == nil {
			t.Error("Expected error for network failure")
		}
		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
			t.Errorf("Expected network error, got %v", err)
		}
	})

	t.Run("fails with malformed HTML", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		malformedURL := "https://example.com/malformed"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == malformedURL {
				return htmlResponse(http.StatusOK, "<html><head><title>Test</head></body>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		_, err = parser.ParseURL(malformedURL)
		if err == nil {
			t.Error("Expected error for malformed HTML")
		}

		if !strings.Contains(err.Error(), "failed to parse HTML") &&
			!strings.Contains(err.Error(), "could not extract title") &&
			!strings.Contains(err.Error(), "could not extract body content") &&
			!strings.Contains(err.Error(), "confidence too low") {
			t.Errorf("Expected HTML parsing or extraction error, got %v", err)
		}
	})

	t.Run("fails when no title can be extracted", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		noTitleURL := "https://example.com/notitle"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == noTitleURL {
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test</title></head>
					<body>
						<div id="bodyContent">
							<p>Content without proper title</p>
						</div>
					</body>
				</html>`), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		result, err := parser.ParseURL(noTitleURL)

		if err != nil {
			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title extraction error, got %v", err)
			}
		} else if result != nil {
			if result.Title == "" {
				t.Error("Expected title to be extracted via metadata fallback")
			}
		}
	})

	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
		wikipediaHTML := `<html>
			<head><title>Integration Test Article</title></head>
			<body>
				<h1 id="firstHeading">Integration Test Article</h1>
				<div id="bodyContent">
					<p>This is integration test content.</p>
					<div class="noprint">This should be stripped</div>
					<p>More content here.</p>
				</div>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/integration"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, wikipediaHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		article := &models.Article{
			URL:          contentURL,
			Title:        content.Title,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      time.Now(),
			Modified:     time.Now(),
		}

		if article.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
		}
		if article.URL != contentURL {
			t.Errorf("Expected URL %s, got %s", contentURL, article.URL)
		}
		if article.MarkdownPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if article.HTMLPath == "" {
			t.Error("Expected non-empty HTML path")
		}
		if article.Created.IsZero() {
			t.Error("Expected Created timestamp to be set")
		}
		if article.Modified.IsZero() {
			t.Error("Expected Modified timestamp to be set")
		}

		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}

		mdContent, err := os.ReadFile(article.MarkdownPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "# Integration Test Article") {
			t.Error("Expected markdown to contain title")
		}
		if !strings.Contains(string(mdContent), "This is integration test content") {
			t.Error("Expected markdown to contain article content")
		}
		if strings.Contains(string(mdContent), "This should be stripped") {
			t.Error("Expected stripped content to be removed from markdown")
		}

		htmlContent, err := os.ReadFile(article.HTMLPath)
		if err != nil {
			t.Fatalf("Failed to read HTML file: %v", err)
		}
		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
			t.Error("Expected HTML to contain title")
		}
		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
			t.Error("Expected HTML to contain DOCTYPE")
		}
	})

	t.Run("successfully handles article with metadata", func(t *testing.T) {
		contentHTML := `<html>
			<head>
				<title>Test Paper</title>
				<meta name="citation_author" content="Dr. Test Author">
				<meta name="citation_date" content="2024-01-01">
			</head>
			<body>
				<h1 class="title">Test Research Paper</h1>
				<blockquote class="abstract">
					<p>This is the abstract of the research paper.</p>
					<p>It contains important research findings.</p>
				</blockquote>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
			Date:   "//meta[@name='citation_date']/@content",
			Author: "//meta[@name='citation_author']/@content",
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/metadata"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, contentHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		if content.Title != "Test Research Paper" {
			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
		}
		if content.Author != "Dr. Test Author" {
			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
		}
		if content.Date != "2024-01-01" {
			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
		}

		mdPath, _, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		mdContent, err := os.ReadFile(mdPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
			t.Error("Expected markdown to contain author")
		}
		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
			t.Error("Expected markdown to contain date")
		}

		article := &models.Article{
			Author: content.Author,
			Date:   content.Date,
		}

		if article.Author != "Dr. Test Author" {
			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
		}
		if article.Date != "2024-01-01" {
			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
		}
	})
}

// roundTripFunc adapts a plain function to the http.RoundTripper interface so
// tests can stub HTTP responses without starting a real server.
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	return f(req)
}

// newMockHTTPClient returns an *http.Client whose transport is the given stub
// function.
func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client {
	t.Helper()
	return &http.Client{Transport: fn}
}

// htmlResponse builds an *http.Response with the given status, an HTML
// content type, and the given body.
func htmlResponse(status int, body string) *http.Response {
	return &http.Response{
		StatusCode: status,
		Header:     http.Header{"Content-Type": []string{"text/html; charset=utf-8"}},
		Body:       io.NopCloser(strings.NewReader(body)),
	}
}

// withDefaultHTTPClient temporarily swaps http.DefaultClient's transport for
// the stub function and restores the original when the test finishes.
func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) {
	t.Helper()
	original := http.DefaultClient.Transport
	http.DefaultClient.Transport = fn
	t.Cleanup(func() {
		http.DefaultClient.Transport = original
	})
}