desertthunder.dev/noteleaf
cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm · leaflet · readability · golang
feat: article parser
desertthunder.dev · 4 months ago · fad93018 · 51e8ac11
+354 -21 · 2 changed files
internal/articles/articles.go (+22 -14)
···
 
 // ParsingRule represents XPath rules for extracting content from a specific domain
 type ParsingRule struct {
-    Domain   string
-    Title    string
-    Author   string
-    Date     string
-    Body     string
-    Strip    []string // XPath selectors for elements to remove
+    Domain   string
+    Title    string
+    Author   string
+    Date     string
+    Body     string
+    // XPath selectors for elements to remove
+    Strip    []string
     TestURLs []string
 }
 
···
 
 // ArticleParser implements the Parser interface
 type ArticleParser struct {
-    rules map[string]*ParsingRule
+    rules  map[string]*ParsingRule
+    client *http.Client
 }
 
-// NewArticleParser creates a new ArticleParser with loaded rules
-func NewArticleParser() (*ArticleParser, error) {
+// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
+func NewArticleParser(client *http.Client) (*ArticleParser, error) {
     parser := &ArticleParser{
-        rules: make(map[string]*ParsingRule),
+        rules:  make(map[string]*ParsingRule),
+        client: client,
     }
 
     if err := parser.loadRules(); err != nil {
···
     return parser, nil
 }
 
+// AddRule adds or replaces a parsing rule for a specific domain
+func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
+    p.rules[domain] = rule
+}
+
 func (p *ArticleParser) loadRules() error {
     entries, err := rulesFS.ReadDir("rules")
     if err != nil {
···
             return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
         }
 
-        rule, err := p.parseRuleFile(domain, string(content))
+        rule, err := p.parseRules(domain, string(content))
         if err != nil {
             return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
         }
···
     return nil
 }
 
-func (p *ArticleParser) parseRuleFile(domain, content string) (*ParsingRule, error) {
+func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
     rule := &ParsingRule{Domain: domain, Strip: []string{}}
     scanner := bufio.NewScanner(strings.NewReader(content))
     for scanner.Scan() {
···
 
     domain := parsedURL.Hostname()
 
-    resp, err := http.Get(urlStr)
+    resp, err := p.client.Get(urlStr)
     if err != nil {
         return nil, fmt.Errorf("failed to fetch URL: %w", err)
     }
···
 
 // CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
 func CreateArticleFromURL(url, dir string) (*models.Article, error) {
-    parser, err := NewArticleParser()
+    parser, err := NewArticleParser(http.DefaultClient)
     if err != nil {
         return nil, fmt.Errorf("failed to create parser: %w", err)
     }
···
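
The practical effect of this commit is that the HTTP client is injected instead of hard-coded (the old code called http.Get directly), and AddRule lets callers register parsing rules at runtime alongside the embedded rules/ files. A minimal sketch of how the reworked API might be called, based only on the signatures visible in this diff; the file name, timeout value, example.com domain, and XPath selectors are illustrative, not taken from the repo:

// articles_example_test.go (hypothetical file): a sketch of the new call pattern.
package articles

import (
    "fmt"
    "net/http"
    "time"
)

func Example_customClient() {
    // Inject a client with a timeout instead of relying on http.DefaultClient.
    parser, err := NewArticleParser(&http.Client{Timeout: 10 * time.Second})
    if err != nil {
        fmt.Println("create parser:", err)
        return
    }

    // Rules can now be registered at runtime in addition to the embedded rules/ files.
    parser.AddRule("example.com", &ParsingRule{
        Domain: "example.com",
        Title:  "//h1",
        Body:   "//article",
        Strip:  []string{"//footer"},
    })

    content, err := parser.ParseURL("https://example.com/article")
    if err != nil {
        fmt.Println("parse:", err)
        return
    }

    mdPath, htmlPath, err := parser.SaveArticle(content, "./articles")
    if err != nil {
        fmt.Println("save:", err)
        return
    }
    fmt.Println(content.Title, mdPath, htmlPath)
}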
internal/articles/articles_test.go (+332 -7)
···
 
 import (
     "fmt"
+    "net/http"
+    "net/http/httptest"
     "os"
     "strings"
     "testing"
···
 
 // ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
 func ExampleParser_Convert() {
-    parser, err := NewArticleParser()
+    parser, err := NewArticleParser(http.DefaultClient)
     if err != nil {
         fmt.Printf("Failed to create parser: %v\n", err)
         return
···
 func TestArticleParser(t *testing.T) {
     t.Run("New", func(t *testing.T) {
         t.Run("successfully creates parser", func(t *testing.T) {
-            parser, err := NewArticleParser()
+            parser, err := NewArticleParser(http.DefaultClient)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
         })
 
         t.Run("loads expected domains", func(t *testing.T) {
-            parser, err := NewArticleParser()
+            parser, err := NewArticleParser(http.DefaultClient)
             if err != nil {
                 t.Fatalf("Failed to create parser: %v", err)
             }
···
         })
     })
 
-    t.Run("parseRuleFile", func(t *testing.T) {
+    t.Run("parseRules", func(t *testing.T) {
         parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
 
         t.Run("parses valid rule file", func(t *testing.T) {
···
 strip: //footer
 test_url: https://example.com/article`
 
-            rule, err := parser.parseRuleFile("example.com", content)
+            rule, err := parser.parseRules("example.com", content)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
 body: //article
 `
 
-            rule, err := parser.parseRuleFile("test.com", content)
+            rule, err := parser.parseRules("test.com", content)
             if err != nil {
                 t.Fatalf("Expected no error, got %v", err)
             }
···
     })
 
     t.Run("Convert", func(t *testing.T) {
-        parser, err := NewArticleParser()
+        parser, err := NewArticleParser(http.DefaultClient)
         if err != nil {
             t.Fatalf("Failed to create parser: %v", err)
         }
···
                 t.Error("Expected stripped content to be removed from markdown")
             }
         })
+    })
+
+    t.Run("ParseURL", func(t *testing.T) {
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            switch {
+            case strings.Contains(r.URL.Path, "404"):
+                w.WriteHeader(http.StatusNotFound)
+            case strings.Contains(r.URL.Path, "unsupported"):
+                w.WriteHeader(http.StatusOK)
+                w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+            default:
+                // Return Wikipedia-like structure for localhost rule
+                w.WriteHeader(http.StatusOK)
+                w.Write([]byte(`<html>
+<head><title>Test Article</title></head>
+<body>
+<h1 id="firstHeading">Test Wikipedia Article</h1>
+<div id="bodyContent">
+<p>This is the article content.</p>
+<div class="noprint">This gets stripped</div>
+</div>
+</body>
+</html>`))
+            }
+        }))
+        defer server.Close()
+
+        parser, err := NewArticleParser(server.Client())
+        if err != nil {
+            t.Fatalf("Failed to create parser: %v", err)
+        }
+
+        localhostRule := &ParsingRule{
+            Domain: "127.0.0.1",
+            Title:  "//h1[@id='firstHeading']",
+            Body:   "//div[@id='bodyContent']",
+            Strip:  []string{"//div[@class='noprint']"},
+        }
+        parser.AddRule("127.0.0.1", localhostRule)
+
+        t.Run("fails with invalid URL", func(t *testing.T) {
+            _, err := parser.ParseURL("not-a-url")
+            if err == nil {
+                t.Error("Expected error for invalid URL")
+            }
+            if !strings.Contains(err.Error(), "unsupported protocol scheme") {
+                t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
+            }
+        })
+
+        t.Run("fails with unsupported domain", func(t *testing.T) {
+            _, err := parser.ParseURL(server.URL + "/unsupported.com")
+            if err == nil {
+                t.Error("Expected error for unsupported domain")
+            }
+        })
+
+        t.Run("fails with HTTP error", func(t *testing.T) {
+            _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
+            if err == nil {
+                t.Error("Expected error for HTTP 404")
+            }
+        })
+
+    })
+
+    t.Run("SaveArticle", func(t *testing.T) {
+        parser := &ArticleParser{}
+        tempDir := t.TempDir()
+
+        content := &ParsedContent{
+            Title:   "Test Article",
+            Author:  "Test Author",
+            Date:    "2023-01-01",
+            Content: "This is test content.",
+            URL:     "https://example.com/test",
+        }
+
+        t.Run("successfully saves article", func(t *testing.T) {
+            mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error, got %v", err)
+            }
+
+            if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+                t.Error("Expected markdown file to exist")
+            }
+            if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+                t.Error("Expected HTML file to exist")
+            }
+
+            mdContent, err := os.ReadFile(mdPath)
+            if err != nil {
+                t.Fatalf("Failed to read markdown file: %v", err)
+            }
+            if !strings.Contains(string(mdContent), "# Test Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if !strings.Contains(string(mdContent), "**Author:** Test Author") {
+                t.Error("Expected markdown to contain author")
+            }
+
+            htmlContentBytes, err := os.ReadFile(htmlPath)
+            if err != nil {
+                t.Fatalf("Failed to read HTML file: %v", err)
+            }
+            if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
+                t.Error("Expected HTML to contain title")
+            }
+        })
+
+        t.Run("handles duplicate filenames", func(t *testing.T) {
+            mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error for first save, got %v", err)
+            }
+
+            mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
+            if err != nil {
+                t.Fatalf("Expected no error for second save, got %v", err)
+            }
+
+            if mdPath1 == mdPath2 {
+                t.Error("Expected different markdown paths for duplicate saves")
+            }
+            if htmlPath1 == htmlPath2 {
+                t.Error("Expected different HTML paths for duplicate saves")
+            }
+
+            if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
+                t.Error("Expected first markdown file to exist")
+            }
+            if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
+                t.Error("Expected second markdown file to exist")
+            }
+        })
+
+        t.Run("fails with invalid directory", func(t *testing.T) {
+            invalidDir := "/nonexistent/directory"
+            _, _, err := parser.SaveArticle(content, invalidDir)
+            if err == nil {
+                t.Error("Expected error for invalid directory")
+            }
+        })
+    })
+
+    t.Run("createHTML", func(t *testing.T) {
+        parser := &ArticleParser{}
+        content := &ParsedContent{
+            Title:   "Test HTML Article",
+            Author:  "HTML Author",
+            Date:    "2023-12-25",
+            Content: "This is **bold** content with *emphasis*.",
+            URL:     "https://example.com/html-test",
+        }
+
+        t.Run("creates valid HTML", func(t *testing.T) {
+            markdown := parser.createMarkdown(content)
+            html := parser.createHTML(content, markdown)
+
+            if !strings.Contains(html, "<!DOCTYPE html>") {
+                t.Error("Expected HTML to contain DOCTYPE")
+            }
+            if !strings.Contains(html, "<title>Test HTML Article</title>") {
+                t.Error("Expected HTML to contain title")
+            }
+            if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
+                t.Error("Expected HTML to contain h1 heading with title")
+            }
+            if !strings.Contains(html, "<strong>bold</strong>") {
+                t.Error("Expected HTML to contain bold formatting")
+            }
+            if !strings.Contains(html, "<em>emphasis</em>") {
+                t.Error("Expected HTML to contain emphasis formatting")
+            }
+        })
+    })
+
+    t.Run("createMarkdown", func(t *testing.T) {
+        parser := &ArticleParser{}
+
+        t.Run("creates markdown with all fields", func(t *testing.T) {
+            content := &ParsedContent{
+                Title:   "Full Content Article",
+                Author:  "Complete Author",
+                Date:    "2023-01-15",
+                Content: "Complete article content here.",
+                URL:     "https://example.com/full",
+            }
+
+            markdown := parser.createMarkdown(content)
+
+            if !strings.Contains(markdown, "# Full Content Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if !strings.Contains(markdown, "**Author:** Complete Author") {
+                t.Error("Expected markdown to contain author")
+            }
+            if !strings.Contains(markdown, "**Date:** 2023-01-15") {
+                t.Error("Expected markdown to contain date")
+            }
+            if !strings.Contains(markdown, "**Source:** https://example.com/full") {
+                t.Error("Expected markdown to contain source URL")
+            }
+            if !strings.Contains(markdown, "**Saved:**") {
+                t.Error("Expected markdown to contain saved timestamp")
+            }
+            if !strings.Contains(markdown, "---") {
+                t.Error("Expected markdown to contain separator")
+            }
+            if !strings.Contains(markdown, "Complete article content here.") {
+                t.Error("Expected markdown to contain article content")
+            }
+        })
+
+        t.Run("creates markdown with minimal fields", func(t *testing.T) {
+            content := &ParsedContent{
+                Title:   "Minimal Article",
+                Content: "Just content.",
+                URL:     "https://example.com/minimal",
+            }
+
+            markdown := parser.createMarkdown(content)
+
+            if !strings.Contains(markdown, "# Minimal Article") {
+                t.Error("Expected markdown to contain title")
+            }
+            if strings.Contains(markdown, "**Author:**") {
+                t.Error("Expected no author field for empty author")
+            }
+            if strings.Contains(markdown, "**Date:**") {
+                t.Error("Expected no date field for empty date")
+            }
+            if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
+                t.Error("Expected markdown to contain source URL")
+            }
+        })
+    })
+}
+
+func TestCreateArticleFromURL(t *testing.T) {
+    tempDir := t.TempDir()
+
+    t.Run("fails with invalid URL", func(t *testing.T) {
+        _, err := CreateArticleFromURL("not-a-url", tempDir)
+        if err == nil {
+            t.Error("Expected error for invalid URL")
+        }
+    })
+
+    t.Run("fails with unsupported domain", func(t *testing.T) {
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            w.WriteHeader(http.StatusOK)
+            w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+        }))
+        defer server.Close()
+
+        _, err := CreateArticleFromURL(server.URL, tempDir)
+        if err == nil {
+            t.Error("Expected error for unsupported domain")
+        }
+    })
+
+    t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
+        wikipediaHTML := `<html>
+<head><title>Integration Test Article</title></head>
+<body>
+<h1 id="firstHeading">Integration Test Article</h1>
+<div id="bodyContent">
+<p>This is integration test content.</p>
+</div>
+</body>
+</html>`
+
+        server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            w.WriteHeader(http.StatusOK)
+            w.Write([]byte(wikipediaHTML))
+        }))
+        defer server.Close()
+
+        // We need to patch the CreateArticleFromURL function to use our test client and rules
+        // For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
+        parser, err := NewArticleParser(server.Client())
+        if err != nil {
+            t.Fatalf("Failed to create parser: %v", err)
+        }
+
+        // Add localhost rule for testing
+        localhostRule := &ParsingRule{
+            Domain: "127.0.0.1",
+            Title:  "//h1[@id='firstHeading']",
+            Body:   "//div[@id='bodyContent']",
+            Strip:  []string{"//div[@class='noprint']"},
+        }
+        parser.AddRule("127.0.0.1", localhostRule)
+
+        content, err := parser.ParseURL(server.URL)
+        if err != nil {
+            t.Fatalf("Expected no error, got %v", err)
+        }
+
+        mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+        if err != nil {
+            t.Fatalf("Failed to save article: %v", err)
+        }
+
+        if content.Title != "Integration Test Article" {
+            t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
+        }
+        if mdPath == "" {
+            t.Error("Expected non-empty markdown path")
+        }
+        if htmlPath == "" {
+            t.Error("Expected non-empty HTML path")
+        }
+
+        // Check files exist
+        if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+            t.Error("Expected markdown file to exist")
+        }
+        if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+            t.Error("Expected HTML file to exist")
+        }
     })
 }
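
The parseRules fixtures above hint at the on-disk rule format: plain key: value lines of XPath selectors, loaded from the embedded rules/ directory. A hedged sketch of feeding such a rule through parseRules; only the body:, strip:, and test_url: keys appear in the fixtures shown in this diff, so the title:, author:, and date: keys and all selectors below are guesses for illustration, not taken from the repo:

// Sketch only: a hypothetical rule fed through parseRules from within the articles package.
package articles

import "fmt"

func sketchRuleFile() {
    const exampleRule = `title: //h1
author: //span[@class="byline"]
date: //time/@datetime
body: //article
strip: //footer
test_url: https://example.com/article`

    parser := &ArticleParser{rules: make(map[string]*ParsingRule)}
    rule, err := parser.parseRules("example.com", exampleRule)
    if err != nil {
        fmt.Println("parse rules:", err)
        return
    }
    fmt.Println(rule.Domain, rule.Body, rule.Strip)
}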