package articles
import (
"errors"
"fmt"
"io"
"net/http"
"os"
"strings"
"testing"
"time"
"github.com/stormlightlabs/noteleaf/internal/models"
)
// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
fmt.Printf("Failed to create parser: %v\n", err)
return
}
htmlPath := "examples/christopher-lloyd.html"
htmlContent, err := os.ReadFile(htmlPath)
if err != nil {
fmt.Printf("Local HTML file not found: %v\n", err)
return
}
markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
if err != nil {
fmt.Printf("Failed to convert HTML: %v\n", err)
return
}
parts := strings.Split(markdown, "\n---\n")
if len(parts) > 0 {
frontmatter := strings.TrimSpace(parts[0])
lines := strings.Split(frontmatter, "\n")
for i, line := range lines {
if i >= 4 {
break
}
if !strings.Contains(line, "**Saved:**") {
fmt.Println(line)
}
}
}
// Output: # Christopher Lloyd
//
// **Author:** Contributors to Wikimedia projects
}
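// TestArticleParser exercises parser construction, rule-file parsing, slug
// generation, HTML-to-markdown conversion, URL fetching, and article persistence.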
func TestArticleParser(t *testing.T) {
t.Run("New", func(t *testing.T) {
t.Run("successfully creates parser", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if parser == nil {
t.Fatal("Expected parser to be created, got nil")
}
if len(parser.rules) == 0 {
t.Error("Expected rules to be loaded")
}
})
t.Run("loads expected domains", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
domains := parser.GetSupportedDomains()
expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}
if len(domains) != len(expectedDomains) {
t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
}
domainMap := make(map[string]bool)
for _, domain := range domains {
domainMap[domain] = true
}
for _, expected := range expectedDomains {
if !domainMap[expected] {
t.Errorf("Expected domain %s not found in supported domains", expected)
}
}
})
})
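// Rule files use a line-oriented "field: xpath" format: repeated strip
// entries accumulate, and lines beginning with # are treated as comments.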
t.Run("parseRules", func(t *testing.T) {
parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
t.Run("parses valid rule file", func(t *testing.T) {
content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`
rule, err := parser.parseRules("example.com", content)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if rule.Domain != "example.com" {
t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
}
if rule.Title != "//h1" {
t.Errorf("Expected title '//h1', got %s", rule.Title)
}
if rule.Author != "//span[@class='author']" {
t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
}
if len(rule.Strip) != 2 {
t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
}
if len(rule.TestURLs) != 1 {
t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
}
})
t.Run("handles empty lines and comments", func(t *testing.T) {
content := `# This is a comment
title: //h1
# Another comment
body: //article
`
rule, err := parser.parseRules("test.com", content)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if rule.Title != "//h1" {
t.Errorf("Expected title '//h1', got %s", rule.Title)
}
if rule.Body != "//article" {
t.Errorf("Expected body '//article', got %s", rule.Body)
}
})
})
t.Run("slugify", func(t *testing.T) {
parser := &ArticleParser{}
tc := []struct {
input string
expected string
}{
{"Simple Title", "simple-title"},
{"Title with Numbers 123", "title-with-numbers-123"},
{"Title-with-Hyphens", "title-with-hyphens"},
{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
{"", ""},
{strings.Repeat("a", 150), strings.Repeat("a", 100)},
}
for _, tt := range tc {
t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
result := parser.slugify(tt.input)
if result != tt.expected {
t.Errorf("Expected '%s', got '%s'", tt.expected, result)
}
})
}
})
t.Run("Convert", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
t.Run("fails with unsupported domain", func(t *testing.T) {
htmlContent := "
TestContent
"
_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")
if err == nil {
t.Error("Expected error for unsupported domain")
}
if !strings.Contains(err.Error(), "confidence too low") &&
!strings.Contains(err.Error(), "could not extract title") {
t.Errorf("Expected heuristic extraction error, got %v", err)
}
})
t.Run("fails with invalid HTML", func(t *testing.T) {
invalidHTML := "Test"
_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err == nil {
t.Error("Expected error for invalid HTML")
}
})
t.Run("fails when no title extracted", func(t *testing.T) {
htmlContent := "TestContent
"
_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err == nil {
t.Error("Expected error when no title can be extracted")
}
if !strings.Contains(err.Error(), "could not extract title") &&
!strings.Contains(err.Error(), "could not extract body content") &&
!strings.Contains(err.Error(), "confidence too low") {
t.Errorf("Expected title, body, or confidence error, got %v", err)
}
})
t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
htmlContent := `<html>
<head><title>Test Article</title></head>
<body>
<h1 id="firstHeading">Test Article Title</h1>
<div id="bodyContent">
<style>.mw-parser-output a { color: blue; }</style>
<p>This is the main content of the article.</p>
<div class="noprint">This should be stripped</div>
<span class="mw-editsection">Edit this section</span>
<p>More content here.</p>
</div>
</body>
</html>`
markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if !strings.Contains(markdown, "# Test Article Title") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
t.Error("Expected markdown to contain source URL")
}
if !strings.Contains(markdown, "This is the main content") {
t.Error("Expected markdown to contain article content")
}
if strings.Contains(markdown, "This should be stripped") {
t.Error("Expected stripped content to be removed from markdown")
}
if strings.Contains(markdown, ".mw-parser-output") {
t.Error("Expected style content to be removed from markdown")
}
if strings.Contains(markdown, "Edit this section") {
t.Error("Expected edit section markers to be removed from markdown")
}
})
t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
htmlContent := `<html>
<head><title>Test Navigation Article</title></head>
<body>
<h1 id="firstHeading">Test Navigation Article</h1>
<div id="bodyContent">
<p>Main article content goes here.</p>
<h2>Section One<span class="mw-editsection">[edit]</span></h2>
<p>Section content.</p>
<table class="navbox"><tr><td>Navigation item 1</td><td>Navigation item 2</td></tr></table>
<div class="navbox">Another navigation box</div>
<table class="vertical-navbox"><tr><td>Vertical nav item</td></tr></table>
<p>More article content.</p>
<div role="navigation">Navigation content</div>
<div id="catlinks">Categories: Test Category</div>
</div>
<div id="footer">Retrieved from example</div>
</body>
</html>`
markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if !strings.Contains(markdown, "Main article content") {
t.Error("Expected markdown to contain main article content")
}
if !strings.Contains(markdown, "Section content") {
t.Error("Expected markdown to contain section content")
}
if !strings.Contains(markdown, "More article content") {
t.Error("Expected markdown to contain additional content")
}
if strings.Contains(markdown, "Navigation item") {
t.Error("Expected navbox table content to be stripped")
}
if strings.Contains(markdown, "Another navigation box") {
t.Error("Expected navbox div content to be stripped")
}
if strings.Contains(markdown, "Vertical nav item") {
t.Error("Expected vertical-navbox content to be stripped")
}
if strings.Contains(markdown, "[edit]") {
t.Error("Expected edit section markers to be stripped")
}
if strings.Contains(markdown, "Navigation content") {
t.Error("Expected role=navigation content to be stripped")
}
if strings.Contains(markdown, "Categories:") {
t.Error("Expected category links to be stripped")
}
if strings.Contains(markdown, "Retrieved from") {
t.Error("Expected footer content to be stripped")
}
})
t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) {
htmlContent := `<html>
<head><title>Heuristic Test Article</title></head>
<body>
<article>
<h1>Heuristic Test Article</h1>
<p>This is a substantial article that should be extracted using heuristic methods.</p>
<p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p>
<p>The heuristic extractor should successfully identify this as main content.</p>
</article>
</body>
</html>`
markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article")
if err == nil {
if !strings.Contains(markdown, "substantial article") {
t.Error("Expected markdown to contain extracted content")
}
}
})
t.Run("includes confidence score in parsed content", func(t *testing.T) {
htmlContent := `<html>
<head><title>Confidence Test</title></head>
<body>
<h1 id="firstHeading">Confidence Test Article</h1>
<div id="bodyContent"><p>Article content for confidence testing.</p></div>
</body>
</html>`
content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence")
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if content.Confidence == 0.0 {
t.Error("Expected non-zero confidence score")
}
if content.ExtractionMethod == "" {
t.Error("Expected extraction method to be set")
}
})
t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) {
// The meta tag names here are assumed sources for the metadata fallback.
htmlContent := `<html>
<head><title>Metadata Fallback Test</title>
<meta name="author" content="Metadata Author">
<meta property="article:published_time" content="2025-01-20"></head>
<body>
<h1 id="firstHeading">Fallback Test</h1>
<div id="bodyContent"><p>Content without author or date in XPath locations.</p></div>
</body>
</html>`
content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test")
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if content.Author != "Metadata Author" {
t.Errorf("Expected metadata fallback for author, got %q", content.Author)
}
if content.Date != "2025-01-20" {
t.Errorf("Expected metadata fallback for date, got %q", content.Date)
}
})
})
t.Run("ParseURL", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
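// Register a minimal rule for example.com so the mocked requests below
// resolve through XPath extraction without touching the network.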
localhostRule := &ParsingRule{
Domain: "example.com",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("example.com", localhostRule)
const (
validURL = "https://example.com/wiki/test"
httpErrorURL = "https://example.com/wiki/404"
unsupportedURL = "https://unsupported-domain.test/article"
)
parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
switch req.URL.String() {
case validURL:
return htmlResponse(http.StatusOK, `<html>
<head><title>Test Article</title></head>
<body>
<h1 id="firstHeading">Test Wikipedia Article</h1>
<div id="bodyContent">
<p>This is the article content.</p>
<div class="noprint">This gets stripped</div>
</div>
</body>
</html>`), nil
case httpErrorURL:
return &http.Response{
StatusCode: http.StatusNotFound,
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader("")),
}, nil
case unsupportedURL:
return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil
default:
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}
}))
t.Run("fails with invalid URL", func(t *testing.T) {
_, err := parser.ParseURL("not-a-url")
if err == nil {
t.Error("Expected error for invalid URL")
}
if !strings.Contains(err.Error(), "unsupported protocol scheme") &&
!strings.Contains(err.Error(), "failed to fetch URL") &&
!strings.Contains(err.Error(), "invalid URL") {
t.Errorf("Expected URL scheme error, got %v", err)
}
})
t.Run("fails with unsupported domain", func(t *testing.T) {
_, err := parser.ParseURL(unsupportedURL)
if err == nil {
t.Error("Expected error for unsupported domain")
}
})
t.Run("fails with HTTP error", func(t *testing.T) {
_, err := parser.ParseURL(httpErrorURL)
if err == nil {
t.Error("Expected error for HTTP 404")
}
})
t.Run("successfully parses supported domain", func(t *testing.T) {
content, err := parser.ParseURL(validURL)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if content == nil {
t.Fatal("Expected parsed content, got nil")
}
if content.Title != "Test Wikipedia Article" {
t.Errorf("Expected title to be extracted, got %q", content.Title)
}
if !strings.Contains(content.Content, "This is the article content.") {
t.Errorf("Expected content to include article text, got %q", content.Content)
}
if strings.Contains(content.Content, "This gets stripped") {
t.Error("Expected strip rules to remove non-content nodes")
}
})
})
t.Run("SaveArticle", func(t *testing.T) {
parser := &ArticleParser{}
tempDir := t.TempDir()
content := &ParsedContent{
Title: "Test Article",
Author: "Test Author",
Date: "2023-01-01",
Content: "This is test content.",
URL: "https://example.com/test",
}
t.Run("successfully saves article", func(t *testing.T) {
mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if _, err := os.Stat(mdPath); os.IsNotExist(err) {
t.Error("Expected markdown file to exist")
}
if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
t.Error("Expected HTML file to exist")
}
mdContent, err := os.ReadFile(mdPath)
if err != nil {
t.Fatalf("Failed to read markdown file: %v", err)
}
if !strings.Contains(string(mdContent), "# Test Article") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(string(mdContent), "**Author:** Test Author") {
t.Error("Expected markdown to contain author")
}
htmlContentBytes, err := os.ReadFile(htmlPath)
if err != nil {
t.Fatalf("Failed to read HTML file: %v", err)
}
if !strings.Contains(string(htmlContentBytes), "Test Article") {
t.Error("Expected HTML to contain title")
}
})
t.Run("handles duplicate filenames", func(t *testing.T) {
mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error for first save, got %v", err)
}
mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Expected no error for second save, got %v", err)
}
if mdPath1 == mdPath2 {
t.Error("Expected different markdown paths for duplicate saves")
}
if htmlPath1 == htmlPath2 {
t.Error("Expected different HTML paths for duplicate saves")
}
if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
t.Error("Expected first markdown file to exist")
}
if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
t.Error("Expected second markdown file to exist")
}
})
t.Run("fails with invalid directory", func(t *testing.T) {
invalidDir := "/nonexistent/directory"
_, _, err := parser.SaveArticle(content, invalidDir)
if err == nil {
t.Error("Expected error for invalid directory")
}
})
})
t.Run("createHTML", func(t *testing.T) {
parser := &ArticleParser{}
content := &ParsedContent{
Title: "Test HTML Article",
Author: "HTML Author",
Date: "2023-12-25",
Content: "This is **bold** content with *emphasis*.",
URL: "https://example.com/html-test",
}
t.Run("creates valid HTML", func(t *testing.T) {
markdown := parser.createMarkdown(content)
html := parser.createHTML(content, markdown)
if !strings.Contains(html, "") {
t.Error("Expected HTML to contain DOCTYPE")
}
if !strings.Contains(html, "Test HTML Article") {
t.Error("Expected HTML to contain title")
}
if !strings.Contains(html, "bold") {
t.Error("Expected HTML to contain bold formatting")
}
if !strings.Contains(html, "emphasis") {
t.Error("Expected HTML to contain emphasis formatting")
}
})
})
t.Run("createMarkdown", func(t *testing.T) {
parser := &ArticleParser{}
t.Run("creates markdown with all fields", func(t *testing.T) {
content := &ParsedContent{
Title: "Full Content Article",
Author: "Complete Author",
Date: "2023-01-15",
Content: "Complete article content here.",
URL: "https://example.com/full",
}
markdown := parser.createMarkdown(content)
if !strings.Contains(markdown, "# Full Content Article") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(markdown, "**Author:** Complete Author") {
t.Error("Expected markdown to contain author")
}
if !strings.Contains(markdown, "**Date:** 2023-01-15") {
t.Error("Expected markdown to contain date")
}
if !strings.Contains(markdown, "**Source:** https://example.com/full") {
t.Error("Expected markdown to contain source URL")
}
if !strings.Contains(markdown, "**Saved:**") {
t.Error("Expected markdown to contain saved timestamp")
}
if !strings.Contains(markdown, "---") {
t.Error("Expected markdown to contain separator")
}
if !strings.Contains(markdown, "Complete article content here.") {
t.Error("Expected markdown to contain article content")
}
})
t.Run("creates markdown with minimal fields", func(t *testing.T) {
content := &ParsedContent{
Title: "Minimal Article",
Content: "Just content.",
URL: "https://example.com/minimal",
}
markdown := parser.createMarkdown(content)
if !strings.Contains(markdown, "# Minimal Article") {
t.Error("Expected markdown to contain title")
}
if strings.Contains(markdown, "**Author:**") {
t.Error("Expected no author field for empty author")
}
if strings.Contains(markdown, "**Date:**") {
t.Error("Expected no date field for empty date")
}
if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
t.Error("Expected markdown to contain source URL")
}
})
})
}
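// TestCreateArticleFromURL covers failure modes of the URL entry point and the
// end-to-end flow of fetching, extracting, and persisting an article.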
func TestCreateArticleFromURL(t *testing.T) {
tempDir := t.TempDir()
t.Run("fails with invalid URL", func(t *testing.T) {
_, err := CreateArticleFromURL("not-a-url", tempDir)
if err == nil {
t.Error("Expected error for invalid URL")
}
if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
t.Errorf("Expected URL parsing error, got %v", err)
}
})
t.Run("fails with empty URL", func(t *testing.T) {
_, err := CreateArticleFromURL("", tempDir)
if err == nil {
t.Error("Expected error for empty URL")
}
})
t.Run("fails with unsupported domain", func(t *testing.T) {
unsupportedURL := "https://unsupported-domain.test/article"
withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == unsupportedURL {
return htmlResponse(http.StatusOK, "
Too little content
"), nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
})
_, err := CreateArticleFromURL(unsupportedURL, tempDir)
if err == nil {
t.Error("Expected error for unsupported domain")
}
if !strings.Contains(err.Error(), "confidence too low") &&
!strings.Contains(err.Error(), "could not extract title") {
t.Errorf("Expected heuristic extraction error, got %v", err)
}
})
t.Run("fails with HTTP error", func(t *testing.T) {
errorURL := "https://example.com/missing"
withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == errorURL {
return &http.Response{
StatusCode: http.StatusNotFound,
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader("")),
}, nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
})
_, err := CreateArticleFromURL(errorURL, tempDir)
if err == nil {
t.Error("Expected error for HTTP 404")
}
if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
t.Errorf("Expected HTTP error, got %v", err)
}
})
t.Run("fails with network error", func(t *testing.T) {
networkURL := "https://example.com/network"
withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == networkURL {
return nil, errors.New("dial error")
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
})
_, err := CreateArticleFromURL(networkURL, tempDir)
if err == nil {
t.Error("Expected error for network failure")
}
if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
t.Errorf("Expected network error, got %v", err)
}
})
t.Run("fails with malformed HTML", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
localhostRule := &ParsingRule{
Domain: "example.com",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("example.com", localhostRule)
malformedURL := "https://example.com/malformed"
parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == malformedURL {
return htmlResponse(http.StatusOK, "Test"), nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}))
_, err = parser.ParseURL(malformedURL)
if err == nil {
t.Error("Expected error for malformed HTML")
}
if !strings.Contains(err.Error(), "failed to parse HTML") &&
!strings.Contains(err.Error(), "could not extract title") &&
!strings.Contains(err.Error(), "could not extract body content") &&
!strings.Contains(err.Error(), "confidence too low") {
t.Errorf("Expected HTML parsing or extraction error, got %v", err)
}
})
t.Run("fails when no title can be extracted", func(t *testing.T) {
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
localhostRule := &ParsingRule{
Domain: "example.com",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("example.com", localhostRule)
noTitleURL := "https://example.com/notitle"
parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == noTitleURL {
return htmlResponse(http.StatusOK, `<html>
<head><title>Test</title></head>
<body><div id="bodyContent"><p>Content without proper title</p></div></body>
</html>`), nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}))
result, err := parser.ParseURL(noTitleURL)
if err != nil {
if !strings.Contains(err.Error(), "could not extract title") &&
!strings.Contains(err.Error(), "confidence too low") {
t.Errorf("Expected title extraction error, got %v", err)
}
} else if result != nil {
if result.Title == "" {
t.Error("Expected title to be extracted via metadata fallback")
}
}
})
t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
wikipediaHTML := `<html>
<head><title>Integration Test Article</title></head>
<body>
<h1 id="firstHeading">Integration Test Article</h1>
<div id="bodyContent">
<p>This is integration test content.</p>
<div class="noprint">This should be stripped</div>
<p>More content here.</p>
</div>
</body>
</html>`
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
localhostRule := &ParsingRule{
Domain: "example.com",
Title: "//h1[@id='firstHeading']",
Body: "//div[@id='bodyContent']",
Strip: []string{"//div[@class='noprint']"},
}
parser.AddRule("example.com", localhostRule)
contentURL := "https://example.com/integration"
parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == contentURL {
return htmlResponse(http.StatusOK, wikipediaHTML), nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}))
content, err := parser.ParseURL(contentURL)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Failed to save article: %v", err)
}
article := &models.Article{
URL: contentURL,
Title: content.Title,
MarkdownPath: mdPath,
HTMLPath: htmlPath,
Created: time.Now(),
Modified: time.Now(),
}
if article.Title != "Integration Test Article" {
t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
}
if article.URL != contentURL {
t.Errorf("Expected URL %s, got %s", contentURL, article.URL)
}
if article.MarkdownPath == "" {
t.Error("Expected non-empty markdown path")
}
if article.HTMLPath == "" {
t.Error("Expected non-empty HTML path")
}
if article.Created.IsZero() {
t.Error("Expected Created timestamp to be set")
}
if article.Modified.IsZero() {
t.Error("Expected Modified timestamp to be set")
}
if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
t.Error("Expected markdown file to exist")
}
if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
t.Error("Expected HTML file to exist")
}
mdContent, err := os.ReadFile(article.MarkdownPath)
if err != nil {
t.Fatalf("Failed to read markdown file: %v", err)
}
if !strings.Contains(string(mdContent), "# Integration Test Article") {
t.Error("Expected markdown to contain title")
}
if !strings.Contains(string(mdContent), "This is integration test content") {
t.Error("Expected markdown to contain article content")
}
if strings.Contains(string(mdContent), "This should be stripped") {
t.Error("Expected stripped content to be removed from markdown")
}
htmlContent, err := os.ReadFile(article.HTMLPath)
if err != nil {
t.Fatalf("Failed to read HTML file: %v", err)
}
if !strings.Contains(string(htmlContent), "Integration Test Article") {
t.Error("Expected HTML to contain title")
}
if !strings.Contains(string(htmlContent), "") {
t.Error("Expected HTML to contain DOCTYPE")
}
})
t.Run("successfully handles article with metadata", func(t *testing.T) {
contentHTML := `<html>
<head><title>Test Paper</title>
<meta name="citation_author" content="Dr. Test Author">
<meta name="citation_date" content="2024-01-01"></head>
<body>
<h1 class="title">Test Research Paper</h1>
<blockquote class="abstract">
<p>This is the abstract of the research paper.</p>
<p>It contains important research findings.</p>
</blockquote>
</body>
</html>`
parser, err := NewArticleParser(http.DefaultClient)
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
localhostRule := &ParsingRule{
Domain: "example.com",
Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
Date: "//meta[@name='citation_date']/@content",
Author: "//meta[@name='citation_author']/@content",
}
parser.AddRule("example.com", localhostRule)
contentURL := "https://example.com/metadata"
parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
if req.URL.String() == contentURL {
return htmlResponse(http.StatusOK, contentHTML), nil
}
return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}))
content, err := parser.ParseURL(contentURL)
if err != nil {
t.Fatalf("Expected no error, got %v", err)
}
if content.Title != "Test Research Paper" {
t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
}
if content.Author != "Dr. Test Author" {
t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
}
if content.Date != "2024-01-01" {
t.Errorf("Expected date '2024-01-01', got %s", content.Date)
}
mdPath, _, err := parser.SaveArticle(content, tempDir)
if err != nil {
t.Fatalf("Failed to save article: %v", err)
}
mdContent, err := os.ReadFile(mdPath)
if err != nil {
t.Fatalf("Failed to read markdown file: %v", err)
}
if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
t.Error("Expected markdown to contain author")
}
if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
t.Error("Expected markdown to contain date")
}
article := &models.Article{
Author: content.Author,
Date: content.Date,
}
if article.Author != "Dr. Test Author" {
t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
}
if article.Date != "2024-01-01" {
t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
}
})
}
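// roundTripFunc adapts a plain function to http.RoundTripper so tests can
// stub per-request transport behavior.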
type roundTripFunc func(*http.Request) (*http.Response, error)
func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
return f(req)
}
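// newMockHTTPClient returns an *http.Client whose transport is the supplied stub.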
func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client {
t.Helper()
return &http.Client{Transport: fn}
}
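// htmlResponse builds a canned *http.Response with an HTML content type,
// the given status code, and body.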
func htmlResponse(status int, body string) *http.Response {
return &http.Response{
StatusCode: status,
Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}},
Body: io.NopCloser(strings.NewReader(body)),
}
}
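// withDefaultHTTPClient temporarily installs the stub as http.DefaultClient's
// transport and restores the original when the test finishes.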
func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) {
t.Helper()
original := http.DefaultClient.Transport
http.DefaultClient.Transport = fn
t.Cleanup(func() {
http.DefaultClient.Transport = original
})
}