desertthunder.dev/noteleaf
cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm · leaflet · readability · golang
feat: article parser
desertthunder.dev · 4 months ago · fad93018 · 51e8ac11
+354 -21 · 2 changed files
internal/articles/articles.go (+22 -14)
···
 
 // ParsingRule represents XPath rules for extracting content from a specific domain
 type ParsingRule struct {
-    Domain   string
-    Title    string
-    Author   string
-    Date     string
-    Body     string
-    Strip    []string // XPath selectors for elements to remove
+    Domain   string
+    Title    string
+    Author   string
+    Date     string
+    Body     string
+    // XPath selectors for elements to remove
+    Strip    []string
     TestURLs []string
 }
 
···
 
 // ArticleParser implements the Parser interface
 type ArticleParser struct {
-    rules map[string]*ParsingRule
+    rules  map[string]*ParsingRule
+    client *http.Client
 }
 
-// NewArticleParser creates a new ArticleParser with loaded rules
-func NewArticleParser() (*ArticleParser, error) {
+// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
+func NewArticleParser(client *http.Client) (*ArticleParser, error) {
     parser := &ArticleParser{
-        rules: make(map[string]*ParsingRule),
+        rules:  make(map[string]*ParsingRule),
+        client: client,
     }
 
     if err := parser.loadRules(); err != nil {
···
     return parser, nil
 }
 
+// AddRule adds or replaces a parsing rule for a specific domain
+func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
+    p.rules[domain] = rule
+}
+
 func (p *ArticleParser) loadRules() error {
     entries, err := rulesFS.ReadDir("rules")
     if err != nil {
···
             return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
         }
 
-        rule, err := p.parseRuleFile(domain, string(content))
+        rule, err := p.parseRules(domain, string(content))
         if err != nil {
             return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
         }
···
     return nil
 }
 
-func (p *ArticleParser) parseRuleFile(domain, content string) (*ParsingRule, error) {
+func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
     rule := &ParsingRule{Domain: domain, Strip: []string{}}
     scanner := bufio.NewScanner(strings.NewReader(content))
     for scanner.Scan() {
···
 
     domain := parsedURL.Hostname()
 
-    resp, err := http.Get(urlStr)
+    resp, err := p.client.Get(urlStr)
     if err != nil {
         return nil, fmt.Errorf("failed to fetch URL: %w", err)
     }
···
 
 // CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
 func CreateArticleFromURL(url, dir string) (*models.Article, error) {
-    parser, err := NewArticleParser()
+    parser, err := NewArticleParser(http.DefaultClient)
     if err != nil {
         return nil, fmt.Errorf("failed to create parser: %w", err)
     }
···
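
The practical effect of this commit is that the HTTP client is injected instead of hard-coded (the old code called http.Get directly), and AddRule lets callers register parsing rules at runtime alongside the embedded rules/ files. A minimal sketch of how the reworked API might be called, based only on the signatures visible in this diff; the file name, timeout value, example.com domain, and XPath selectors are illustrative, not taken from the repo:

// articles_example_test.go (hypothetical file): a sketch of the new call pattern.
package articles

import (
    "fmt"
    "net/http"
    "time"
)

func Example_customClient() {
    // Inject a client with a timeout instead of relying on http.DefaultClient.
    parser, err := NewArticleParser(&http.Client{Timeout: 10 * time.Second})
    if err != nil {
        fmt.Println("create parser:", err)
        return
    }

    // Rules can now be registered at runtime in addition to the embedded rules/ files.
    parser.AddRule("example.com", &ParsingRule{
        Domain: "example.com",
        Title:  "//h1",
        Body:   "//article",
        Strip:  []string{"//footer"},
    })

    content, err := parser.ParseURL("https://example.com/article")
    if err != nil {
        fmt.Println("parse:", err)
        return
    }

    mdPath, htmlPath, err := parser.SaveArticle(content, "./articles")
    if err != nil {
        fmt.Println("save:", err)
        return
    }
    fmt.Println(content.Title, mdPath, htmlPath)
}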
internal/articles/articles_test.go (+332 -7)
···
 
 import (
     "fmt"
+    "net/http"
+    "net/http/httptest"
     "os"
     "strings"
     "testing"
···
 
 // ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
 func ExampleParser_Convert() {
-    parser, err := NewArticleParser()
+    parser, err := NewArticleParser(http.DefaultClient)
     if err != nil {
         fmt.Printf("Failed to create parser: %v\n", err)
         return
···
 func TestArticleParser(t *testing.T) {
     t.Run("New", func(t *testing.T) {
         t.Run("successfully creates parser", func(t *testing.T) {
-            parser, err := NewArticleParser()
+            parser, err := NewArticleParser(http.DefaultClient)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
         })
 
         t.Run("loads expected domains", func(t *testing.T) {
-            parser, err := NewArticleParser()
+            parser, err := NewArticleParser(http.DefaultClient)
             if err != nil {
                 t.Fatalf("Failed to create parser: %v", err)
             }
···
         })
     })
 
-    t.Run("parseRuleFile", func(t *testing.T) {
+    t.Run("parseRules", func(t *testing.T) {
         parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
 
         t.Run("parses valid rule file", func(t *testing.T) {
···
 strip: //footer
 test_url: https://example.com/article`
 
-            rule, err := parser.parseRuleFile("example.com", content)
+            rule, err := parser.parseRules("example.com", content)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
 body: //article
 `
 
-            rule, err := parser.parseRuleFile("test.com", content)
+            rule, err := parser.parseRules("test.com", content)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
     })
 
     t.Run("Convert", func(t *testing.T) {
-        parser, err := NewArticleParser()
+        parser, err := NewArticleParser(http.DefaultClient)
         if err != nil {
             t.Fatalf("Failed to create parser: %v", err)
         }
···
                 t.Error("Expected stripped content to be removed from markdown")
             }
         })
+    })
+
+    t.Run("ParseURL", func(t *testing.T) {
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            switch {
+            case strings.Contains(r.URL.Path, "404"):
+                w.WriteHeader(http.StatusNotFound)
+            case strings.Contains(r.URL.Path, "unsupported"):
+                w.WriteHeader(http.StatusOK)
+                w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+            default:
+                // Return Wikipedia-like structure for localhost rule
+                w.WriteHeader(http.StatusOK)
+                w.Write([]byte(`<html>
+<head><title>Test Article</title></head>
+<body>
+<h1 id="firstHeading">Test Wikipedia Article</h1>
+<div id="bodyContent">
+<p>This is the article content.</p>
+<div class="noprint">This gets stripped</div>
+</div>
+</body>
+</html>`))
+            }
+        }))
+        defer server.Close()
+
+        parser, err := NewArticleParser(server.Client())
+        if err != nil {
+            t.Fatalf("Failed to create parser: %v", err)
+        }
+
+        localhostRule := &ParsingRule{
+            Domain: "127.0.0.1",
+            Title:  "//h1[@id='firstHeading']",
+            Body:   "//div[@id='bodyContent']",
+            Strip:  []string{"//div[@class='noprint']"},
+        }
+        parser.AddRule("127.0.0.1", localhostRule)
+
+        t.Run("fails with invalid URL", func(t *testing.T) {
+            _, err := parser.ParseURL("not-a-url")
+            if err == nil {
+                t.Error("Expected error for invalid URL")
+            }
+            if !strings.Contains(err.Error(), "unsupported protocol scheme") {
+                t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
+            }
+        })
+
+        t.Run("fails with unsupported domain", func(t *testing.T) {
+            _, err := parser.ParseURL(server.URL + "/unsupported.com")
+            if err == nil {
+                t.Error("Expected error for unsupported domain")
+            }
+        })
+
+        t.Run("fails with HTTP error", func(t *testing.T) {
+            _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
+            if err == nil {
+                t.Error("Expected error for HTTP 404")
+            }
+        })
+
+    })
+
+    t.Run("SaveArticle", func(t *testing.T) {
+        parser := &ArticleParser{}
+        tempDir := t.TempDir()
+
+        content := &ParsedContent{
+            Title:   "Test Article",
+            Author:  "Test Author",
+            Date:    "2023-01-01",
+            Content: "This is test content.",
+            URL:     "https://example.com/test",
+        }
+
+        t.Run("successfully saves article", func(t *testing.T) {
+            mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error, got %v", err)
+            }
+
+            if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+                t.Error("Expected markdown file to exist")
+            }
+            if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+                t.Error("Expected HTML file to exist")
+            }
+
+            mdContent, err := os.ReadFile(mdPath)
+            if err != nil {
+                t.Fatalf("Failed to read markdown file: %v", err)
+            }
+            if !strings.Contains(string(mdContent), "# Test Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if !strings.Contains(string(mdContent), "**Author:** Test Author") {
+                t.Error("Expected markdown to contain author")
+            }
+
+            htmlContentBytes, err := os.ReadFile(htmlPath)
+            if err != nil {
+                t.Fatalf("Failed to read HTML file: %v", err)
+            }
+            if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
+                t.Error("Expected HTML to contain title")
+            }
+        })
+
+        t.Run("handles duplicate filenames", func(t *testing.T) {
+            mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error for first save, got %v", err)
+            }
+
+            mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error for second save, got %v", err)
+            }
+
+            if mdPath1 == mdPath2 {
+                t.Error("Expected different markdown paths for duplicate saves")
+            }
+            if htmlPath1 == htmlPath2 {
+                t.Error("Expected different HTML paths for duplicate saves")
+            }
+
+            if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
+                t.Error("Expected first markdown file to exist")
+            }
+            if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
+                t.Error("Expected second markdown file to exist")
+            }
+        })
+
+        t.Run("fails with invalid directory", func(t *testing.T) {
+            invalidDir := "/nonexistent/directory"
+            _, _, err := parser.SaveArticle(content, invalidDir)
+            if err == nil {
+                t.Error("Expected error for invalid directory")
+            }
+        })
+    })
+
+    t.Run("createHTML", func(t *testing.T) {
+        parser := &ArticleParser{}
+        content := &ParsedContent{
+            Title:   "Test HTML Article",
+            Author:  "HTML Author",
+            Date:    "2023-12-25",
+            Content: "This is **bold** content with *emphasis*.",
+            URL:     "https://example.com/html-test",
+        }
+
+        t.Run("creates valid HTML", func(t *testing.T) {
+            markdown := parser.createMarkdown(content)
+            html := parser.createHTML(content, markdown)
+
+            if !strings.Contains(html, "<!DOCTYPE html>") {
+                t.Error("Expected HTML to contain DOCTYPE")
+            }
+            if !strings.Contains(html, "<title>Test HTML Article</title>") {
+                t.Error("Expected HTML to contain title")
+            }
+            if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
+                t.Error("Expected HTML to contain h1 heading with title")
+            }
+            if !strings.Contains(html, "<strong>bold</strong>") {
+                t.Error("Expected HTML to contain bold formatting")
+            }
+            if !strings.Contains(html, "<em>emphasis</em>") {
+                t.Error("Expected HTML to contain emphasis formatting")
+            }
+        })
+    })
+
+    t.Run("createMarkdown", func(t *testing.T) {
+        parser := &ArticleParser{}
+
+        t.Run("creates markdown with all fields", func(t *testing.T) {
+            content := &ParsedContent{
+                Title:   "Full Content Article",
+                Author:  "Complete Author",
+                Date:    "2023-01-15",
+                Content: "Complete article content here.",
+                URL:     "https://example.com/full",
+            }
+
+            markdown := parser.createMarkdown(content)
+
+            if !strings.Contains(markdown, "# Full Content Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if !strings.Contains(markdown, "**Author:** Complete Author") {
+                t.Error("Expected markdown to contain author")
+            }
+            if !strings.Contains(markdown, "**Date:** 2023-01-15") {
+                t.Error("Expected markdown to contain date")
+            }
+            if !strings.Contains(markdown, "**Source:** https://example.com/full") {
+                t.Error("Expected markdown to contain source URL")
+            }
+            if !strings.Contains(markdown, "**Saved:**") {
+                t.Error("Expected markdown to contain saved timestamp")
+            }
+            if !strings.Contains(markdown, "---") {
+                t.Error("Expected markdown to contain separator")
+            }
+            if !strings.Contains(markdown, "Complete article content here.") {
+                t.Error("Expected markdown to contain article content")
+            }
+        })
+
+        t.Run("creates markdown with minimal fields", func(t *testing.T) {
+            content := &ParsedContent{
+                Title:   "Minimal Article",
+                Content: "Just content.",
+                URL:     "https://example.com/minimal",
+            }
+
+            markdown := parser.createMarkdown(content)
+
+            if !strings.Contains(markdown, "# Minimal Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if strings.Contains(markdown, "**Author:**") {
+                t.Error("Expected no author field for empty author")
+            }
+            if strings.Contains(markdown, "**Date:**") {
+                t.Error("Expected no date field for empty date")
+            }
+            if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
+                t.Error("Expected markdown to contain source URL")
+            }
+        })
+    })
+}
+
+func TestCreateArticleFromURL(t *testing.T) {
+    tempDir := t.TempDir()
+
+    t.Run("fails with invalid URL", func(t *testing.T) {
+        _, err := CreateArticleFromURL("not-a-url", tempDir)
+        if err == nil {
+            t.Error("Expected error for invalid URL")
+        }
+    })
+
+    t.Run("fails with unsupported domain", func(t *testing.T) {
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            w.WriteHeader(http.StatusOK)
+            w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+        }))
+        defer server.Close()
+
+        _, err := CreateArticleFromURL(server.URL, tempDir)
+        if err == nil {
+            t.Error("Expected error for unsupported domain")
+        }
+    })
+
+    t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
+        wikipediaHTML := `<html>
+<head><title>Integration Test Article</title></head>
+<body>
+<h1 id="firstHeading">Integration Test Article</h1>
+<div id="bodyContent">
+<p>This is integration test content.</p>
+</div>
+</body>
+</html>`
+
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            w.WriteHeader(http.StatusOK)
+            w.Write([]byte(wikipediaHTML))
+        }))
+        defer server.Close()
+
+        // We need to patch the CreateArticleFromURL function to use our test client and rules
+        // For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
+        parser, err := NewArticleParser(server.Client())
+        if err != nil {
+            t.Fatalf("Failed to create parser: %v", err)
+        }
+
+        // Add localhost rule for testing
+        localhostRule := &ParsingRule{
+            Domain: "127.0.0.1",
+            Title:  "//h1[@id='firstHeading']",
+            Body:   "//div[@id='bodyContent']",
+            Strip:  []string{"//div[@class='noprint']"},
+        }
+        parser.AddRule("127.0.0.1", localhostRule)
+
+        content, err := parser.ParseURL(server.URL)
+        if err != nil {
+            t.Fatalf("Expected no error, got %v", err)
+        }
+
+        mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+        if err != nil {
+            t.Fatalf("Failed to save article: %v", err)
+        }
+
+        if content.Title != "Integration Test Article" {
+            t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
+        }
+        if mdPath == "" {
+            t.Error("Expected non-empty markdown path")
+        }
+        if htmlPath == "" {
+            t.Error("Expected non-empty HTML path")
+        }
+
+        // Check files exist
+        if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+            t.Error("Expected markdown file to exist")
+        }
+        if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+            t.Error("Expected HTML file to exist")
+        }
     })
 }
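
The parseRules fixtures above hint at the on-disk rule format: plain key: value lines of XPath selectors, loaded from the embedded rules/ directory. A hedged sketch of feeding such a rule through parseRules; only the body:, strip:, and test_url: keys appear in the fixtures shown in this diff, so the title:, author:, and date: keys and all selectors below are guesses for illustration, not taken from the repo:

// Sketch only: a hypothetical rule fed through parseRules from within the articles package.
package articles

import "fmt"

func sketchRuleFile() {
    const exampleRule = `title: //h1
author: //span[@class="byline"]
date: //time/@datetime
body: //article
strip: //footer
test_url: https://example.com/article`

    parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
    rule, err := parser.parseRules("example.com", exampleRule)
    if err != nil {
        fmt.Println("parse rules:", err)
        return
    }
    fmt.Println(rule.Domain, rule.Body, rule.Strip)
}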