cli + tui to publish to leaflet (wip) & manage tasks, notes, and watch/read lists 🍈
package articles

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/stormlightlabs/noteleaf/internal/models"
)

// ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
func ExampleParser_Convert() {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		fmt.Printf("Failed to create parser: %v\n", err)
		return
	}

	htmlPath := "examples/christopher-lloyd.html"
	htmlContent, err := os.ReadFile(htmlPath)
	if err != nil {
		fmt.Printf("Local HTML file not found: %v\n", err)
		return
	}

	markdown, err := parser.Convert(string(htmlContent), ".wikipedia.org", "https://en.wikipedia.org/wiki/Christopher_Lloyd")
	if err != nil {
		fmt.Printf("Failed to convert HTML: %v\n", err)
		return
	}

	parts := strings.Split(markdown, "\n---\n")
	if len(parts) > 0 {
		frontmatter := strings.TrimSpace(parts[0])
		lines := strings.Split(frontmatter, "\n")

		for i, line := range lines {
			if i >= 4 {
				break
			}

			if !strings.Contains(line, "**Saved:**") {
				fmt.Println(line)
			}
		}
	}

	// Output: # Christopher Lloyd
	//
	// **Author:** Contributors to Wikimedia projects
}

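// TestArticleParser exercises parser construction, rule-file parsing, slug
// generation, HTML-to-markdown conversion, URL fetching, and article saving.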
func TestArticleParser(t *testing.T) {
	t.Run("New", func(t *testing.T) {
		t.Run("successfully creates parser", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if parser == nil {
				t.Fatal("Expected parser to be created, got nil")
			}
			if len(parser.rules) == 0 {
				t.Error("Expected rules to be loaded")
			}
		})

		t.Run("loads expected domains", func(t *testing.T) {
			parser, err := NewArticleParser(http.DefaultClient)
			if err != nil {
				t.Fatalf("Failed to create parser: %v", err)
			}

			domains := parser.GetSupportedDomains()
			expectedDomains := []string{".wikipedia.org", "arxiv.org", "baseballprospectus.com"}

			if len(domains) != len(expectedDomains) {
				t.Errorf("Expected %d domains, got %d", len(expectedDomains), len(domains))
			}

			domainMap := make(map[string]bool)
			for _, domain := range domains {
				domainMap[domain] = true
			}

			for _, expected := range expectedDomains {
				if !domainMap[expected] {
					t.Errorf("Expected domain %s not found in supported domains", expected)
				}
			}
		})
	})

	t.Run("parseRules", func(t *testing.T) {
		parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

		t.Run("parses valid rule file", func(t *testing.T) {
			content := `title: //h1
author: //span[@class='author']
date: //time
body: //article
strip: //nav
strip: //footer
test_url: https://example.com/article`

			rule, err := parser.parseRules("example.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Domain != "example.com" {
				t.Errorf("Expected domain 'example.com', got %s", rule.Domain)
			}
			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Author != "//span[@class='author']" {
				t.Errorf("Expected author '//span[@class='author']', got %s", rule.Author)
			}
			if len(rule.Strip) != 2 {
				t.Errorf("Expected 2 strip rules, got %d", len(rule.Strip))
			}
			if len(rule.TestURLs) != 1 {
				t.Errorf("Expected 1 test URL, got %d", len(rule.TestURLs))
			}
		})

		t.Run("handles empty lines and comments", func(t *testing.T) {
			content := `# This is a comment
title: //h1

# Another comment
body: //article
`

			rule, err := parser.parseRules("test.com", content)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if rule.Title != "//h1" {
				t.Errorf("Expected title '//h1', got %s", rule.Title)
			}
			if rule.Body != "//article" {
				t.Errorf("Expected body '//article', got %s", rule.Body)
			}
		})
	})

	t.Run("slugify", func(t *testing.T) {
		parser := &ArticleParser{}

		tc := []struct {
			input    string
			expected string
		}{
			{"Simple Title", "simple-title"},
			{"Title with Numbers 123", "title-with-numbers-123"},
			{"Title-with-Hyphens", "title-with-hyphens"},
			{"Title with Spaces and Multiple Spaces", "title-with-spaces-and-multiple-spaces"},
			{"Title!@#$%^&*()with Special Characters", "title-with-special-characters"},
			{"", ""},
			{strings.Repeat("a", 150), strings.Repeat("a", 100)},
		}

		for _, tt := range tc {
			t.Run(fmt.Sprintf("slugify '%s'", tt.input), func(t *testing.T) {
				result := parser.slugify(tt.input)
				if result != tt.expected {
					t.Errorf("Expected '%s', got '%s'", tt.expected, result)
				}
			})
		}
	})

	t.Run("Convert", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		t.Run("fails with unsupported domain", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, "unsupported.com", "https://unsupported.com/article")

			// Fatal rather than Error: err.Error() below would panic on a nil error.
			if err == nil {
				t.Fatal("Expected error for unsupported domain")
			}

			if !strings.Contains(err.Error(), "confidence too low") &&
				!strings.Contains(err.Error(), "could not extract title") {
				t.Errorf("Expected heuristic extraction error, got %v", err)
			}
		})

		t.Run("fails with invalid HTML", func(t *testing.T) {
			invalidHTML := "<html><head><title>Test</head></body>"
			_, err := parser.Convert(invalidHTML, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Error("Expected error for invalid HTML")
			}
		})

		t.Run("fails when no title extracted", func(t *testing.T) {
			htmlContent := "<html><head><title>Test</title></head><body><p>Content</p></body></html>"
			_, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")

			if err == nil {
				t.Fatal("Expected error when no title can be extracted")
			}

			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "could not extract body content") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title, body, or confidence error, got %v", err)
			}
		})

		t.Run("successfully converts valid Wikipedia HTML", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Article</title></head>
				<body>
					<h1 id="firstHeading">Test Article Title</h1>
					<div id="bodyContent">
						<style>.mw-parser-output .hatnote{font-style:italic;}</style>
						<p>This is the main content of the article.</p>
						<div class="noprint">This should be stripped</div>
						<div class="editsection">Edit this section</div>
						<p>More content here.</p>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "# Test Article Title") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Source:** https://en.wikipedia.org/wiki/Test") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "This is the main content") {
				t.Error("Expected markdown to contain article content")
			}
			if strings.Contains(markdown, "This should be stripped") {
				t.Error("Expected stripped content to be removed from markdown")
			}
			if strings.Contains(markdown, ".mw-parser-output") {
				t.Error("Expected style content to be removed from markdown")
			}
			if strings.Contains(markdown, "Edit this section") {
				t.Error("Expected edit section markers to be removed from markdown")
			}
		})

		t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Test Navigation Article</title></head>
				<body>
					<h1 id="firstHeading">Test Navigation Article</h1>
					<div id="bodyContent">
						<p>Main article content goes here.</p>
						<h2>Section One<span class="mw-editsection">[edit]</span></h2>
						<p>Section content.</p>
						<table class="navbox" role="navigation">
							<tr><td>Navigation item 1</td></tr>
							<tr><td>Navigation item 2</td></tr>
						</table>
						<div class="navbox">
							<p>Another navigation box</p>
						</div>
						<table class="vertical-navbox">
							<tr><td>Vertical nav item</td></tr>
						</table>
						<p>More article content.</p>
						<div role="navigation">
							<p>Navigation content</p>
						</div>
						<div id="catlinks">
							<p>Categories: Test Category</p>
						</div>
						<div id="footer">
							<p>Retrieved from Wikipedia</p>
						</div>
					</div>
				</body>
			</html>`

			markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if !strings.Contains(markdown, "Main article content") {
				t.Error("Expected markdown to contain main article content")
			}
			if !strings.Contains(markdown, "Section content") {
				t.Error("Expected markdown to contain section content")
			}
			if !strings.Contains(markdown, "More article content") {
				t.Error("Expected markdown to contain additional content")
			}

			if strings.Contains(markdown, "Navigation item") {
				t.Error("Expected navbox table content to be stripped")
			}
			if strings.Contains(markdown, "Another navigation box") {
				t.Error("Expected navbox div content to be stripped")
			}
			if strings.Contains(markdown, "Vertical nav item") {
				t.Error("Expected vertical-navbox content to be stripped")
			}
			if strings.Contains(markdown, "[edit]") {
				t.Error("Expected edit section markers to be stripped")
			}
			if strings.Contains(markdown, "Navigation content") {
				t.Error("Expected role=navigation content to be stripped")
			}
			if strings.Contains(markdown, "Categories:") {
				t.Error("Expected category links to be stripped")
			}
			if strings.Contains(markdown, "Retrieved from") {
				t.Error("Expected footer content to be stripped")
			}
		})

		t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Heuristic Test Article</title>
				<meta property="og:author" content="Heuristic Author">
				<meta property="article:published_time" content="2025-01-15">
			</head><body>
				<article>
					<p>This is a substantial article that should be extracted using heuristic methods.</p>
					<p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p>
					<p>The heuristic extractor should successfully identify this as main content.</p>
				</article>
			</body></html>`

			markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article")

			if err == nil {
				if !strings.Contains(markdown, "substantial article") {
					t.Error("Expected markdown to contain extracted content")
				}
			}
		})

		t.Run("includes confidence score in parsed content", func(t *testing.T) {
			htmlContent := `<html>
				<head><title>Confidence Test</title></head>
				<body>
					<h1 id="firstHeading">Confidence Test Article</h1>
					<div id="bodyContent">
						<p>Article content for confidence testing.</p>
					</div>
				</body>
			</html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Confidence == 0.0 {
				t.Error("Expected non-zero confidence score")
			}

			if content.ExtractionMethod == "" {
				t.Error("Expected extraction method to be set")
			}
		})

		t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) {
			htmlContent := `<html><head>
				<title>Metadata Fallback Test</title>
				<meta property="og:author" content="Metadata Author">
				<meta property="article:published_time" content="2025-01-20">
			</head><body>
				<h1 id="firstHeading">Fallback Test</h1>
				<div id="bodyContent">
					<p>Content without author or date in XPath locations.</p>
				</div>
			</body></html>`

			content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test")
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if content.Author != "Metadata Author" {
				t.Errorf("Expected metadata fallback for author, got %q", content.Author)
			}

			if content.Date != "2025-01-20" {
				t.Errorf("Expected metadata fallback for date, got %q", content.Date)
			}
		})
	})

	t.Run("ParseURL", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		const (
			validURL       = "https://example.com/wiki/test"
			httpErrorURL   = "https://example.com/wiki/404"
			unsupportedURL = "https://unsupported-domain.test/article"
		)

		// Stub the HTTP transport so each known URL returns a canned response.
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			switch req.URL.String() {
			case validURL:
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test Article</title></head>
					<body>
						<h1 id="firstHeading">Test Wikipedia Article</h1>
						<div id="bodyContent">
							<p>This is the article content.</p>
							<div class="noprint">This gets stripped</div>
						</div>
					</body>
				</html>`), nil
			case httpErrorURL:
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			case unsupportedURL:
				return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil
			default:
				return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
			}
		}))

		t.Run("fails with invalid URL", func(t *testing.T) {
			_, err := parser.ParseURL("not-a-url")
			if err == nil {
				t.Fatal("Expected error for invalid URL")
			}
			if !strings.Contains(err.Error(), "unsupported protocol scheme") &&
				!strings.Contains(err.Error(), "failed to fetch URL") &&
				!strings.Contains(err.Error(), "invalid URL") {
				t.Errorf("Expected URL scheme error, got %v", err)
			}
		})

		t.Run("fails with unsupported domain", func(t *testing.T) {
			_, err := parser.ParseURL(unsupportedURL)
			if err == nil {
				t.Error("Expected error for unsupported domain")
			}
		})

		t.Run("fails with HTTP error", func(t *testing.T) {
			_, err := parser.ParseURL(httpErrorURL)
			if err == nil {
				t.Error("Expected error for HTTP 404")
			}
		})

		t.Run("successfully parses supported domain", func(t *testing.T) {
			content, err := parser.ParseURL(validURL)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if content == nil {
				t.Fatal("Expected parsed content, got nil")
			}
			if content.Title != "Test Wikipedia Article" {
				t.Errorf("Expected title to be extracted, got %q", content.Title)
			}
			if !strings.Contains(content.Content, "This is the article content.") {
				t.Errorf("Expected content to include article text, got %q", content.Content)
			}
			if strings.Contains(content.Content, "This gets stripped") {
				t.Error("Expected strip rules to remove non-content nodes")
			}
		})
	})

	t.Run("SaveArticle", func(t *testing.T) {
		parser := &ArticleParser{}
		tempDir := t.TempDir()

		content := &ParsedContent{
			Title:   "Test Article",
			Author:  "Test Author",
			Date:    "2023-01-01",
			Content: "This is test content.",
			URL:     "https://example.com/test",
		}

		t.Run("successfully saves article", func(t *testing.T) {
			mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}

			if _, err := os.Stat(mdPath); os.IsNotExist(err) {
				t.Error("Expected markdown file to exist")
			}
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				t.Error("Expected HTML file to exist")
			}

			mdContent, err := os.ReadFile(mdPath)
			if err != nil {
				t.Fatalf("Failed to read markdown file: %v", err)
			}
			if !strings.Contains(string(mdContent), "# Test Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(string(mdContent), "**Author:** Test Author") {
				t.Error("Expected markdown to contain author")
			}

			htmlContentBytes, err := os.ReadFile(htmlPath)
			if err != nil {
				t.Fatalf("Failed to read HTML file: %v", err)
			}
			if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
				t.Error("Expected HTML to contain title")
			}
		})

		t.Run("handles duplicate filenames", func(t *testing.T) {
			mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for first save, got %v", err)
			}

			mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
			if err != nil {
				t.Fatalf("Expected no error for second save, got %v", err)
			}

			if mdPath1 == mdPath2 {
				t.Error("Expected different markdown paths for duplicate saves")
			}
			if htmlPath1 == htmlPath2 {
				t.Error("Expected different HTML paths for duplicate saves")
			}

			if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
				t.Error("Expected first markdown file to exist")
			}
			if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
				t.Error("Expected second markdown file to exist")
			}
		})

		t.Run("fails with invalid directory", func(t *testing.T) {
			invalidDir := "/nonexistent/directory"
			_, _, err := parser.SaveArticle(content, invalidDir)
			if err == nil {
				t.Error("Expected error for invalid directory")
			}
		})
	})

	t.Run("createHTML", func(t *testing.T) {
		parser := &ArticleParser{}
		content := &ParsedContent{
			Title:   "Test HTML Article",
			Author:  "HTML Author",
			Date:    "2023-12-25",
			Content: "This is **bold** content with *emphasis*.",
			URL:     "https://example.com/html-test",
		}

		t.Run("creates valid HTML", func(t *testing.T) {
			markdown := parser.createMarkdown(content)
			html := parser.createHTML(content, markdown)

			if !strings.Contains(html, "<!DOCTYPE html>") {
				t.Error("Expected HTML to contain DOCTYPE")
			}
			if !strings.Contains(html, "<title>Test HTML Article</title>") {
				t.Error("Expected HTML to contain title")
			}
			if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
				t.Error("Expected HTML to contain h1 heading with title")
			}
			if !strings.Contains(html, "<strong>bold</strong>") {
				t.Error("Expected HTML to contain bold formatting")
			}
			if !strings.Contains(html, "<em>emphasis</em>") {
				t.Error("Expected HTML to contain emphasis formatting")
			}
		})
	})

	t.Run("createMarkdown", func(t *testing.T) {
		parser := &ArticleParser{}

		t.Run("creates markdown with all fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Full Content Article",
				Author:  "Complete Author",
				Date:    "2023-01-15",
				Content: "Complete article content here.",
				URL:     "https://example.com/full",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Full Content Article") {
				t.Error("Expected markdown to contain title")
			}
			if !strings.Contains(markdown, "**Author:** Complete Author") {
				t.Error("Expected markdown to contain author")
			}
			if !strings.Contains(markdown, "**Date:** 2023-01-15") {
				t.Error("Expected markdown to contain date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/full") {
				t.Error("Expected markdown to contain source URL")
			}
			if !strings.Contains(markdown, "**Saved:**") {
				t.Error("Expected markdown to contain saved timestamp")
			}
			if !strings.Contains(markdown, "---") {
				t.Error("Expected markdown to contain separator")
			}
			if !strings.Contains(markdown, "Complete article content here.") {
				t.Error("Expected markdown to contain article content")
			}
		})

		t.Run("creates markdown with minimal fields", func(t *testing.T) {
			content := &ParsedContent{
				Title:   "Minimal Article",
				Content: "Just content.",
				URL:     "https://example.com/minimal",
			}

			markdown := parser.createMarkdown(content)

			if !strings.Contains(markdown, "# Minimal Article") {
				t.Error("Expected markdown to contain title")
			}
			if strings.Contains(markdown, "**Author:**") {
				t.Error("Expected no author field for empty author")
			}
			if strings.Contains(markdown, "**Date:**") {
				t.Error("Expected no date field for empty date")
			}
			if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
				t.Error("Expected markdown to contain source URL")
			}
		})
	})
}

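// TestCreateArticleFromURL covers end-to-end article creation: URL
// validation, HTTP and network failures, malformed HTML, and saving
// parsed content to disk.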
func TestCreateArticleFromURL(t *testing.T) {
	tempDir := t.TempDir()

	t.Run("fails with invalid URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("not-a-url", tempDir)
		if err == nil {
			t.Fatal("Expected error for invalid URL")
		}
		if !strings.Contains(err.Error(), "invalid URL") && !strings.Contains(err.Error(), "failed to parse URL") {
			t.Errorf("Expected URL parsing error, got %v", err)
		}
	})

	t.Run("fails with empty URL", func(t *testing.T) {
		_, err := CreateArticleFromURL("", tempDir)
		if err == nil {
			t.Error("Expected error for empty URL")
		}
	})

	t.Run("fails with unsupported domain", func(t *testing.T) {
		unsupportedURL := "https://unsupported-domain.test/article"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == unsupportedURL {
				return htmlResponse(http.StatusOK, "<html><body><div>Too little content</div></body></html>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(unsupportedURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for unsupported domain")
		}
		if !strings.Contains(err.Error(), "confidence too low") &&
			!strings.Contains(err.Error(), "could not extract title") {
			t.Errorf("Expected heuristic extraction error, got %v", err)
		}
	})

	t.Run("fails with HTTP error", func(t *testing.T) {
		errorURL := "https://example.com/missing"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == errorURL {
				return &http.Response{
					StatusCode: http.StatusNotFound,
					Header:     make(http.Header),
					Body:       io.NopCloser(strings.NewReader("")),
				}, nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(errorURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for HTTP 404")
		}
		if !strings.Contains(err.Error(), "HTTP error") && !strings.Contains(err.Error(), "404") {
			t.Errorf("Expected HTTP error, got %v", err)
		}
	})

	t.Run("fails with network error", func(t *testing.T) {
		networkURL := "https://example.com/network"
		withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == networkURL {
				return nil, errors.New("dial error")
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		})

		_, err := CreateArticleFromURL(networkURL, tempDir)
		if err == nil {
			t.Fatal("Expected error for network failure")
		}
		if !strings.Contains(err.Error(), "failed to fetch URL") && !strings.Contains(err.Error(), "connection refused") {
			t.Errorf("Expected network error, got %v", err)
		}
	})

	t.Run("fails with malformed HTML", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		malformedURL := "https://example.com/malformed"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == malformedURL {
				return htmlResponse(http.StatusOK, "<html><head><title>Test</head></body>"), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		_, err = parser.ParseURL(malformedURL)
		if err == nil {
			t.Fatal("Expected error for malformed HTML")
		}

		if !strings.Contains(err.Error(), "failed to parse HTML") &&
			!strings.Contains(err.Error(), "could not extract title") &&
			!strings.Contains(err.Error(), "could not extract body content") &&
			!strings.Contains(err.Error(), "confidence too low") {
			t.Errorf("Expected HTML parsing or extraction error, got %v", err)
		}
	})

	t.Run("fails when no title can be extracted", func(t *testing.T) {
		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		noTitleURL := "https://example.com/notitle"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == noTitleURL {
				return htmlResponse(http.StatusOK, `<html>
					<head><title>Test</title></head>
					<body>
						<div id="bodyContent">
							<p>Content without proper title</p>
						</div>
					</body>
				</html>`), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		result, err := parser.ParseURL(noTitleURL)

		if err != nil {
			if !strings.Contains(err.Error(), "could not extract title") &&
				!strings.Contains(err.Error(), "confidence too low") {
				t.Errorf("Expected title extraction error, got %v", err)
			}
		} else if result != nil {
			if result.Title == "" {
				t.Error("Expected title to be extracted via metadata fallback")
			}
		}
	})

	t.Run("successfully creates article structure from parsed content", func(t *testing.T) {
		wikipediaHTML := `<html>
			<head><title>Integration Test Article</title></head>
			<body>
				<h1 id="firstHeading">Integration Test Article</h1>
				<div id="bodyContent">
					<p>This is integration test content.</p>
					<div class="noprint">This should be stripped</div>
					<p>More content here.</p>
				</div>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[@id='firstHeading']",
			Body:   "//div[@id='bodyContent']",
			Strip:  []string{"//div[@class='noprint']"},
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/integration"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, wikipediaHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		article := &models.Article{
			URL:          contentURL,
			Title:        content.Title,
			MarkdownPath: mdPath,
			HTMLPath:     htmlPath,
			Created:      time.Now(),
			Modified:     time.Now(),
		}

		if article.Title != "Integration Test Article" {
			t.Errorf("Expected title 'Integration Test Article', got %s", article.Title)
		}
		if article.URL != contentURL {
			t.Errorf("Expected URL %s, got %s", contentURL, article.URL)
		}
		if article.MarkdownPath == "" {
			t.Error("Expected non-empty markdown path")
		}
		if article.HTMLPath == "" {
			t.Error("Expected non-empty HTML path")
		}
		if article.Created.IsZero() {
			t.Error("Expected Created timestamp to be set")
		}
		if article.Modified.IsZero() {
			t.Error("Expected Modified timestamp to be set")
		}

		if _, err := os.Stat(article.MarkdownPath); os.IsNotExist(err) {
			t.Error("Expected markdown file to exist")
		}
		if _, err := os.Stat(article.HTMLPath); os.IsNotExist(err) {
			t.Error("Expected HTML file to exist")
		}

		mdContent, err := os.ReadFile(article.MarkdownPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "# Integration Test Article") {
			t.Error("Expected markdown to contain title")
		}
		if !strings.Contains(string(mdContent), "This is integration test content") {
			t.Error("Expected markdown to contain article content")
		}
		if strings.Contains(string(mdContent), "This should be stripped") {
			t.Error("Expected stripped content to be removed from markdown")
		}

		htmlContent, err := os.ReadFile(article.HTMLPath)
		if err != nil {
			t.Fatalf("Failed to read HTML file: %v", err)
		}
		if !strings.Contains(string(htmlContent), "<title>Integration Test Article</title>") {
			t.Error("Expected HTML to contain title")
		}
		if !strings.Contains(string(htmlContent), "<!DOCTYPE html>") {
			t.Error("Expected HTML to contain DOCTYPE")
		}
	})

	t.Run("successfully handles article with metadata", func(t *testing.T) {
		contentHTML := `<html>
			<head>
				<title>Test Paper</title>
				<meta name="citation_author" content="Dr. Test Author">
				<meta name="citation_date" content="2024-01-01">
			</head>
			<body>
				<h1 class="title">Test Research Paper</h1>
				<blockquote class="abstract">
					<p>This is the abstract of the research paper.</p>
					<p>It contains important research findings.</p>
				</blockquote>
			</body>
		</html>`

		parser, err := NewArticleParser(http.DefaultClient)
		if err != nil {
			t.Fatalf("Failed to create parser: %v", err)
		}

		localhostRule := &ParsingRule{
			Domain: "example.com",
			Title:  "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]",
			Body:   "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]",
			Date:   "//meta[@name='citation_date']/@content",
			Author: "//meta[@name='citation_author']/@content",
		}
		parser.AddRule("example.com", localhostRule)

		contentURL := "https://example.com/metadata"
		parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
			if req.URL.String() == contentURL {
				return htmlResponse(http.StatusOK, contentHTML), nil
			}
			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
		}))

		content, err := parser.ParseURL(contentURL)
		if err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}

		if content.Title != "Test Research Paper" {
			t.Errorf("Expected title 'Test Research Paper', got %s", content.Title)
		}
		if content.Author != "Dr. Test Author" {
			t.Errorf("Expected author 'Dr. Test Author', got %s", content.Author)
		}
		if content.Date != "2024-01-01" {
			t.Errorf("Expected date '2024-01-01', got %s", content.Date)
		}

		mdPath, _, err := parser.SaveArticle(content, tempDir)
		if err != nil {
			t.Fatalf("Failed to save article: %v", err)
		}

		mdContent, err := os.ReadFile(mdPath)
		if err != nil {
			t.Fatalf("Failed to read markdown file: %v", err)
		}
		if !strings.Contains(string(mdContent), "**Author:** Dr. Test Author") {
			t.Error("Expected markdown to contain author")
		}
		if !strings.Contains(string(mdContent), "**Date:** 2024-01-01") {
			t.Error("Expected markdown to contain date")
		}

		article := &models.Article{
			Author: content.Author,
			Date:   content.Date,
		}

		if article.Author != "Dr. Test Author" {
			t.Errorf("Expected article author 'Dr. Test Author', got %s", article.Author)
		}
		if article.Date != "2024-01-01" {
			t.Errorf("Expected article date '2024-01-01', got %s", article.Date)
		}
	})
}

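// roundTripFunc adapts an ordinary function to the http.RoundTripper
// interface so tests can stub HTTP responses.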
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	return f(req)
}

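// newMockHTTPClient returns an *http.Client whose transport is the given
// stub function.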
func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client {
	t.Helper()
	return &http.Client{Transport: fn}
}

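// htmlResponse builds an *http.Response with the given status code and an
// HTML body.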
func htmlResponse(status int, body string) *http.Response {
	return &http.Response{
		StatusCode: status,
		Header:     http.Header{"Content-Type": []string{"text/html; charset=utf-8"}},
		Body:       io.NopCloser(strings.NewReader(body)),
	}
}

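// withDefaultHTTPClient swaps http.DefaultClient's transport for the stub
// and restores the original when the test finishes.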
func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) {
	t.Helper()
	original := http.DefaultClient.Transport
	http.DefaultClient.Transport = fn
	t.Cleanup(func() {
		http.DefaultClient.Transport = original
	})
}