cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang

test: add test cases for CreateArticleFromURL

- rename articles.go to parser.go to avoid the redundant internal/articles/articles.go naming

+253 -10

internal/articles/articles.go → internal/articles/parser.go
internal/articles/articles_test.go → internal/articles/parser_test.go
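
The diff below grows the parser tests from a single happy-path case into coverage of the error paths. For orientation, here is a minimal sketch of the flow CreateArticleFromURL is assumed to follow, inferred only from the calls the tests make (NewArticleParser, ParseURL, SaveArticle) and the models.Article fields they assert on; the committed implementation in internal/articles/parser.go may differ:

	// Sketch only: signature and flow inferred from the tests below,
	// not copied from parser.go. Assumes net/http, time, and the
	// package-local models import are available.
	func CreateArticleFromURL(url, dir string) (*models.Article, error) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			return nil, err
		}
		// ParseURL is where "no parsing rule found", "failed to fetch URL",
		// and "could not extract title" errors would surface.
		content, err := parser.ParseURL(url)
		if err != nil {
			return nil, err
		}
		// SaveArticle writes the markdown and HTML copies under dir.
		mdPath, htmlPath, err := parser.SaveArticle(content, dir)
		if err != nil {
			return nil, err
		}
		now := time.Now()
		return &models.Article{
			URL:          url,
			Title:        content.Title,
			Author:       content.Author,
			Date:         content.Date,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      now,
			Modified:     now,
		}, nil
	}

Each early return in that flow corresponds to one of the failure subtests added here.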
···
 	"os"
 	"strings"
 	"testing"
+	"time"
+
+	"github.com/stormlightlabs/noteleaf/internal/models"
 )

 // ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
···
 		if err == nil {
 			t.Error("Expected error for invalid URL")
 		}
+		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
+			t.Errorf("Expected URL parsing error, got %v", err)
+		}
+	})
+
+	t.Run("fails with empty URL", func(t *testing.T) {
+		_, err := CreateArticleFromURL("", tempDir)
+		if err == nil {
+			t.Error("Expected error for empty URL")
+		}
 	})

 	t.Run("fails with unsupported domain", func(t *testing.T) {
···
 		if err == nil {
 			t.Error("Expected error for unsupported domain")
 		}
+		if !strings.Contains(err.Error(), "no parsing rule found") {
+			t.Errorf("Expected 'no parsing rule found' error, got %v", err)
+		}
 	})

-	t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
+	t.Run("fails with HTTP error", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusNotFound)
+		}))
+		defer server.Close()
+
+		// Use a direct Wikipedia URL that would be processed by the real function
+		_, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir)
+		if err == nil {
+			t.Error("Expected error for HTTP 404")
+		}
+		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
+			t.Errorf("Expected HTTP error, got %v", err)
+		}
+	})
+
+	t.Run("fails with network error", func(t *testing.T) {
+		// Use a non-existent server to trigger network error
+		_, err := CreateArticleFromURL("http://localhost:99999/test", tempDir)
+		if err == nil {
+			t.Error("Expected error for network failure")
+		}
+		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
+			t.Errorf("Expected network error, got %v", err)
+		}
+	})
+
+	t.Run("fails with invalid directory", func(t *testing.T) {
+		// Skip this test as it would require network access to test with real URLs
+		t.Skip("Skipping invalid directory test - requires network access")
+	})
+
+	t.Run("fails with malformed HTML", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte("<html><head><title>Test</head></body>")) // Malformed HTML
+		}))
+		defer server.Close()
+
+		// Create a custom parser with localhost rule for testing
+		parser, err := NewArticleParser(server.Client())
+		if err != nil {
+			t.Fatalf("Failed to create parser: %v", err)
+		}
+
+		localhostRule := &ParsingRule{
+			Domain: "127.0.0.1",
+			Title:  "//h1[@id='firstHeading']",
+			Body:   "//div[@id='bodyContent']",
+			Strip:  []string{"//div[@class='noprint']"},
+		}
+		parser.AddRule("127.0.0.1", localhostRule)
+
+		_, err = parser.ParseURL(server.URL)
+		if err == nil {
+			t.Error("Expected error for malformed HTML")
+		}
+		// Malformed HTML may either fail to parse or fail to extract title
+		if !strings.Contains(err.Error(), "failed to parse HTML") && !strings.Contains(err.Error(), "could not extract title") {
+			t.Errorf("Expected HTML parsing or title extraction error, got %v", err)
+		}
+	})
+
+	t.Run("fails when no title can be extracted", func(t *testing.T) {
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte(`<html>
+			<head><title>Test</title></head>
+			<body>
+				<div id="bodyContent">
+					<p>Content without proper title</p>
+				</div>
+			</body>
+			</html>`)) // No h1 with id="firstHeading"
+		}))
+		defer server.Close()
+
+		// Create a custom parser with localhost rule for testing
+		parser, err := NewArticleParser(server.Client())
+		if err != nil {
+			t.Fatalf("Failed to create parser: %v", err)
+		}
+
+		localhostRule := &ParsingRule{
+			Domain: "127.0.0.1",
+			Title:  "//h1[@id='firstHeading']",
+			Body:   "//div[@id='bodyContent']",
+			Strip:  []string{"//div[@class='noprint']"},
+		}
+		parser.AddRule("127.0.0.1", localhostRule)
+
+		_, err = parser.ParseURL(server.URL)
+		if err == nil {
+			t.Error("Expected error when no title can be extracted")
+		}
+		if !strings.Contains(err.Error(), "could not extract title") {
+			t.Errorf("Expected 'could not extract title' error, got %v", err)
+		}
+	})
+
+	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
 		wikipediaHTML := `<html>
 		<head><title>Integration Test Article</title></head>
 		<body>
 			<h1 id="firstHeading">Integration Test Article</h1>
 			<div id="bodyContent">
 				<p>This is integration test content.</p>
+				<div class="noprint">This should be stripped</div>
+				<p>More content here.</p>
 			</div>
 		</body>
 		</html>`
···
 		}))
 		defer server.Close()

-		// We need to patch the CreateArticleFromURL function to use our test client and rules
-		// For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
+		// Create a custom parser with localhost rule for testing
 		parser, err := NewArticleParser(server.Client())
 		if err != nil {
 			t.Fatalf("Failed to create parser: %v", err)
 		}

-		// Add localhost rule for testing
 		localhostRule := &ParsingRule{
 			Domain: "127.0.0.1",
 			Title:  "//h1[@id='firstHeading']",
···
 			t.Fatalf("Failed to save article: %v", err)
 		}

-		if content.Title != "Integration Test Article" {
-			t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
+		// Test that it creates a proper models.Article structure (simulating CreateArticleFromURL)
+		article := &models.Article{
+			URL:          server.URL,
+			Title:        content.Title,
+			MarkdownPath: mdPath,
+			HTMLPath:     htmlPath,
+			Created:      time.Now(),
+			Modified:     time.Now(),
 		}
-		if mdPath == "" {
+
+		if article.Title != "Integration Test Article" {
+			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
+		}
+		if article.URL != server.URL {
+			t.Errorf("Expected URL %s, got %s", server.URL, article.URL)
+		}
+		if article.MarkdownPath == "" {
 			t.Error("Expected non-empty markdown path")
 		}
-		if htmlPath == "" {
+		if article.HTMLPath == "" {
 			t.Error("Expected non-empty HTML path")
 		}
+		if article.Created.IsZero() {
+			t.Error("Expected Created timestamp to be set")
+		}
+		if article.Modified.IsZero() {
+			t.Error("Expected Modified timestamp to be set")
+		}

 		// Check files exist
-		if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
 			t.Error("Expected markdown file to exist")
 		}
-		if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
 			t.Error("Expected HTML file to exist")
+		}
+
+		// Verify file contents
+		mdContent, err := os.ReadFile(article.MarkdownPath)
+		if err != nil {
+			t.Fatalf("Failed to read markdown file: %v", err)
+		}
+		if !strings.Contains(string(mdContent), "# Integration Test Article") {
+			t.Error("Expected markdown to contain title")
+		}
+		if !strings.Contains(string(mdContent), "This is integration test content") {
+			t.Error("Expected markdown to contain article content")
+		}
+		if strings.Contains(string(mdContent), "This should be stripped") {
+			t.Error("Expected stripped content to be removed from markdown")
+		}
+
+		htmlContent, err := os.ReadFile(article.HTMLPath)
+		if err != nil {
+			t.Fatalf("Failed to read HTML file: %v", err)
+		}
+		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
+			t.Error("Expected HTML to contain title")
+		}
+		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
+			t.Error("Expected HTML to contain DOCTYPE")
+		}
+	})
+
+	t.Run("successfully handles article with metadata", func(t *testing.T) {
+		contentHTML := `<html>
+		<head>
+			<title>Test Paper</title>
+			<meta name="citation_author" content="Dr. Test Author">
+			<meta name="citation_date" content="2024-01-01">
+		</head>
+		<body>
+			<h1 class="title">Test Research Paper</h1>
+			<blockquote class="abstract">
+				<p>This is the abstract of the research paper.</p>
+				<p>It contains important research findings.</p>
+			</blockquote>
+		</body>
+		</html>`
+
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte(contentHTML))
+		}))
+		defer server.Close()
+
+		// Create a custom parser with arXiv-like rule for testing
+		parser, err := NewArticleParser(server.Client())
+		if err != nil {
+			t.Fatalf("Failed to create parser: %v", err)
+		}
+
+		localhostRule := &ParsingRule{
+			Domain: "127.0.0.1",
+			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
+			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
+			Date:   "//meta[@name='citation_date']/@content",
+			Author: "//meta[@name='citation_author']/@content",
+		}
+		parser.AddRule("127.0.0.1", localhostRule)
+
+		content, err := parser.ParseURL(server.URL)
+		if err != nil {
+			t.Fatalf("Expected no error, got %v", err)
+		}
+
+		if content.Title != "Test Research Paper" {
+			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
+		}
+		if content.Author != "Dr. Test Author" {
+			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
+		}
+		if content.Date != "2024-01-01" {
+			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
+		}
+
+		mdPath, _, err := parser.SaveArticle(content, tempDir)
+		if err != nil {
+			t.Fatalf("Failed to save article: %v", err)
+		}
+
+		// Verify markdown contains all metadata
+		mdContent, err := os.ReadFile(mdPath)
+		if err != nil {
+			t.Fatalf("Failed to read markdown file: %v", err)
+		}
+		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
+			t.Error("Expected markdown to contain author")
+		}
+		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
+			t.Error("Expected markdown to contain date")
+		}
+
+		article := &models.Article{
+			Author: content.Author,
+			Date:   content.Date,
+		}
+
+		if article.Author != "Dr. Test Author" {
+			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
+		}
+		if article.Date != "2024-01-01" {
+			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
 		}
 	})
 }
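
Taken together, the assertions describe the record the parsing pipeline is expected to produce. The models.Article fields the tests rely on look roughly like this; the field set and types below are inferred from usage in the diff, not copied from the canonical definition in internal/models:

	// Inferred from the assertions above; not the canonical definition.
	type Article struct {
		URL          string    // source page URL
		Title        string    // extracted via the rule's Title XPath
		Author       string    // optional, e.g. from a citation_author meta tag
		Date         string    // optional, e.g. from a citation_date meta tag
		MarkdownPath string    // saved markdown copy ("# Title", "**Author:** ...")
		HTMLPath     string    // saved standalone HTML copy (includes <!DOCTYPE html>)
		Created      time.Time // set when the article record is created
		Modified     time.Time // updated alongside Created here
	}

Note that the "fails with HTTP error" subtest still reaches out to a real en.wikipedia.org URL; the remaining cases run against local httptest servers, so a plain go test ./internal/articles stays hermetic apart from that one case.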