desertthunder.dev/noteleaf
cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm
leaflet
readability
golang
feat: article parser
desertthunder.dev · 4 months ago
fad93018 · 51e8ac11
+354 -21 · 2 changed files
internal/articles/articles.go (+22 -14)
···
  // ParsingRule represents XPath rules for extracting content from a specific domain
  type ParsingRule struct {
-     Domain string
-     Title string
-     Author string
-     Date string
-     Body string
-     Strip []string // XPath selectors for elements to remove
+     Domain string
+     Title string
+     Author string
+     Date string
+     Body string
+     // XPath selectors for elements to remove
+     Strip []string
      TestURLs []string
  }

···
  // ArticleParser implements the Parser interface
  type ArticleParser struct {
-     rules map[string]*ParsingRule
+     rules  map[string]*ParsingRule
+     client *http.Client
  }

- // NewArticleParser creates a new ArticleParser with loaded rules
- func NewArticleParser() (*ArticleParser, error) {
+ // NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
+ func NewArticleParser(client *http.Client) (*ArticleParser, error) {
      parser := &ArticleParser{
-         rules: make(map[string]*ParsingRule),
+         rules:  make(map[string]*ParsingRule),
+         client: client,
      }

      if err := parser.loadRules(); err != nil {
···
      return parser, nil
  }

+ // AddRule adds or replaces a parsing rule for a specific domain
+ func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
+     p.rules[domain] = rule
+ }
+
  func (p *ArticleParser) loadRules() error {
      entries, err := rulesFS.ReadDir("rules")
      if err != nil {
···
              return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
          }

-         rule, err := p.parseRuleFile(domain, string(content))
+         rule, err := p.parseRules(domain, string(content))
          if err != nil {
              return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
          }
···
      return nil
  }

- func (p *ArticleParser) parseRuleFile(domain, content string) (*ParsingRule, error) {
+ func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
      rule := &ParsingRule{Domain: domain, Strip: []string{}}
      scanner := bufio.NewScanner(strings.NewReader(content))
      for scanner.Scan() {
···

      domain := parsedURL.Hostname()

-     resp, err := http.Get(urlStr)
+     resp, err := p.client.Get(urlStr)
      if err != nil {
          return nil, fmt.Errorf("failed to fetch URL: %w", err)
      }
···

  // CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
  func CreateArticleFromURL(url, dir string) (*models.Article, error) {
-     parser, err := NewArticleParser()
+     parser, err := NewArticleParser(http.DefaultClient)
      if err != nil {
          return nil, fmt.Errorf("failed to create parser: %w", err)
      }
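
Taken together, the changes above make the HTTP client injectable and expose AddRule for registering parsing rules at runtime. A minimal sketch of how the reworked API composes, written as if it sat in the articles package; the timeout, domain, and XPath values below are illustrative only and are not part of this commit:

// Sketch only: composing NewArticleParser, AddRule, ParseURL, and SaveArticle.
// Assumes the signatures shown in this diff; nothing here is committed code.
package articles

import (
    "net/http"
    "time"
)

func exampleUsage(dir string) error {
    // Inject a client with a timeout instead of relying on http.DefaultClient.
    client := &http.Client{Timeout: 10 * time.Second}

    parser, err := NewArticleParser(client)
    if err != nil {
        return err
    }

    // Register an extra rule at runtime, as the tests below do for 127.0.0.1.
    parser.AddRule("example.org", &ParsingRule{
        Domain: "example.org",
        Title:  "//h1",
        Body:   "//article",
        Strip:  []string{"//footer"},
    })

    content, err := parser.ParseURL("https://example.org/post")
    if err != nil {
        return err
    }

    _, _, err = parser.SaveArticle(content, dir)
    return err
}
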
internal/articles/articles_test.go (+332 -7)
···

  import (
      "fmt"
+     "net/http"
+     "net/http/httptest"
      "os"
      "strings"
      "testing"
···

  // ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
  func ExampleParser_Convert() {
-     parser, err := NewArticleParser()
+     parser, err := NewArticleParser(http.DefaultClient)
      if err != nil {
          fmt.Printf("Failed to create parser: %v\n", err)
          return
···
  func TestArticleParser(t *testing.T) {
      t.Run("New", func(t *testing.T) {
          t.Run("successfully creates parser", func(t *testing.T) {
-             parser, err := NewArticleParser()
+             parser, err := NewArticleParser(http.DefaultClient)
              if err != nil {
                  t.Fatalf("Expected no error, got %v", err)
              }
···
          })

          t.Run("loads expected domains", func(t *testing.T) {
-             parser, err := NewArticleParser()
+             parser, err := NewArticleParser(http.DefaultClient)
              if err != nil {
                  t.Fatalf("Failed to create parser: %v", err)
              }
···
          })
      })

-     t.Run("parseRuleFile", func(t *testing.T) {
+     t.Run("parseRules", func(t *testing.T) {
          parser := &ArticleParser{rules: make(map[string]*ParsingRule)}

          t.Run("parses valid rule file", func(t *testing.T) {
···
  strip: //footer
  test_url: https://example.com/article`

-             rule, err := parser.parseRuleFile("example.com", content)
+             rule, err := parser.parseRules("example.com", content)
              if err != nil {
                  t.Fatalf("Expected no error, got %v", err)
              }
···
  body: //article
  `

-             rule, err := parser.parseRuleFile("test.com", content)
+             rule, err := parser.parseRules("test.com", content)
              if err != nil {
                  t.Fatalf("Expected no error, got %v", err)
              }
···
      })

      t.Run("Convert", func(t *testing.T) {
-         parser, err := NewArticleParser()
+         parser, err := NewArticleParser(http.DefaultClient)
          if err != nil {
              t.Fatalf("Failed to create parser: %v", err)
          }
···
              t.Error("Expected stripped content to be removed from markdown")
          }
      })
+     })
+
+     t.Run("ParseURL", func(t *testing.T) {
+         server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+             switch {
+             case strings.Contains(r.URL.Path, "404"):
+                 w.WriteHeader(http.StatusNotFound)
+             case strings.Contains(r.URL.Path, "unsupported"):
+                 w.WriteHeader(http.StatusOK)
+                 w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+             default:
+                 // Return Wikipedia-like structure for localhost rule
+                 w.WriteHeader(http.StatusOK)
+                 w.Write([]byte(`<html>
+ <head><title>Test Article</title></head>
+ <body>
+ <h1 id="firstHeading">Test Wikipedia Article</h1>
+ <div id="bodyContent">
+ <p>This is the article content.</p>
+ <div class="noprint">This gets stripped</div>
+ </div>
+ </body>
+ </html>`))
+             }
+         }))
+         defer server.Close()
+
+         parser, err := NewArticleParser(server.Client())
+         if err != nil {
+             t.Fatalf("Failed to create parser: %v", err)
+         }
+
+         localhostRule := &ParsingRule{
+             Domain: "127.0.0.1",
+             Title:  "//h1[@id='firstHeading']",
+             Body:   "//div[@id='bodyContent']",
+             Strip:  []string{"//div[@class='noprint']"},
+         }
+         parser.AddRule("127.0.0.1", localhostRule)
+
+         t.Run("fails with invalid URL", func(t *testing.T) {
+             _, err := parser.ParseURL("not-a-url")
+             if err == nil {
+                 t.Error("Expected error for invalid URL")
+             }
+             if !strings.Contains(err.Error(), "unsupported protocol scheme") {
+                 t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
+             }
+         })
+
+         t.Run("fails with unsupported domain", func(t *testing.T) {
+             _, err := parser.ParseURL(server.URL + "/unsupported.com")
+             if err == nil {
+                 t.Error("Expected error for unsupported domain")
+             }
+         })
+
+         t.Run("fails with HTTP error", func(t *testing.T) {
+             _, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
+             if err == nil {
+                 t.Error("Expected error for HTTP 404")
+             }
+         })
+
+     })
+
+     t.Run("SaveArticle", func(t *testing.T) {
+         parser := &ArticleParser{}
+         tempDir := t.TempDir()
+
+         content := &ParsedContent{
+             Title:   "Test Article",
+             Author:  "Test Author",
+             Date:    "2023-01-01",
+             Content: "This is test content.",
+             URL:     "https://example.com/test",
+         }
+
+         t.Run("successfully saves article", func(t *testing.T) {
+             mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+             if err != nil {
+                 t.Fatalf("Expected no error, got %v", err)
+             }
+
+             if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+                 t.Error("Expected markdown file to exist")
+             }
+             if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+                 t.Error("Expected HTML file to exist")
+             }
+
+             mdContent, err := os.ReadFile(mdPath)
+             if err != nil {
+                 t.Fatalf("Failed to read markdown file: %v", err)
+             }
+             if !strings.Contains(string(mdContent), "# Test Article") {
+                 t.Error("Expected markdown to contain title")
+             }
+             if !strings.Contains(string(mdContent), "**Author:** Test Author") {
+                 t.Error("Expected markdown to contain author")
+             }
+
+             htmlContentBytes, err := os.ReadFile(htmlPath)
+             if err != nil {
+                 t.Fatalf("Failed to read HTML file: %v", err)
+             }
+             if !strings.Contains(string(htmlContentBytes), "<title>Test Article</title>") {
+                 t.Error("Expected HTML to contain title")
+             }
+         })
+
+         t.Run("handles duplicate filenames", func(t *testing.T) {
+             mdPath1, htmlPath1, err := parser.SaveArticle(content, tempDir)
+             if err != nil {
+                 t.Fatalf("Expected no error for first save, got %v", err)
+             }
+
+             mdPath2, htmlPath2, err := parser.SaveArticle(content, tempDir)
+             if err != nil {
+                 t.Fatalf("Expected no error for second save, got %v", err)
+             }
+
+             if mdPath1 == mdPath2 {
+                 t.Error("Expected different markdown paths for duplicate saves")
+             }
+             if htmlPath1 == htmlPath2 {
+                 t.Error("Expected different HTML paths for duplicate saves")
+             }
+
+             if _, err := os.Stat(mdPath1); os.IsNotExist(err) {
+                 t.Error("Expected first markdown file to exist")
+             }
+             if _, err := os.Stat(mdPath2); os.IsNotExist(err) {
+                 t.Error("Expected second markdown file to exist")
+             }
+         })
+
+         t.Run("fails with invalid directory", func(t *testing.T) {
+             invalidDir := "/nonexistent/directory"
+             _, _, err := parser.SaveArticle(content, invalidDir)
+             if err == nil {
+                 t.Error("Expected error for invalid directory")
+             }
+         })
+     })
+
+     t.Run("createHTML", func(t *testing.T) {
+         parser := &ArticleParser{}
+         content := &ParsedContent{
+             Title:   "Test HTML Article",
+             Author:  "HTML Author",
+             Date:    "2023-12-25",
+             Content: "This is **bold** content with *emphasis*.",
+             URL:     "https://example.com/html-test",
+         }
+
+         t.Run("creates valid HTML", func(t *testing.T) {
+             markdown := parser.createMarkdown(content)
+             html := parser.createHTML(content, markdown)
+
+             if !strings.Contains(html, "<!DOCTYPE html>") {
+                 t.Error("Expected HTML to contain DOCTYPE")
+             }
+             if !strings.Contains(html, "<title>Test HTML Article</title>") {
+                 t.Error("Expected HTML to contain title")
+             }
+             if !strings.Contains(html, "<h1") || !strings.Contains(html, "Test HTML Article") {
+                 t.Error("Expected HTML to contain h1 heading with title")
+             }
+             if !strings.Contains(html, "<strong>bold</strong>") {
+                 t.Error("Expected HTML to contain bold formatting")
+             }
+             if !strings.Contains(html, "<em>emphasis</em>") {
+                 t.Error("Expected HTML to contain emphasis formatting")
+             }
+         })
+     })
+
+     t.Run("createMarkdown", func(t *testing.T) {
+         parser := &ArticleParser{}
+
+         t.Run("creates markdown with all fields", func(t *testing.T) {
+             content := &ParsedContent{
+                 Title:   "Full Content Article",
+                 Author:  "Complete Author",
+                 Date:    "2023-01-15",
+                 Content: "Complete article content here.",
+                 URL:     "https://example.com/full",
+             }
+
+             markdown := parser.createMarkdown(content)
+
+             if !strings.Contains(markdown, "# Full Content Article") {
+                 t.Error("Expected markdown to contain title")
+             }
+             if !strings.Contains(markdown, "**Author:** Complete Author") {
+                 t.Error("Expected markdown to contain author")
+             }
+             if !strings.Contains(markdown, "**Date:** 2023-01-15") {
+                 t.Error("Expected markdown to contain date")
+             }
+             if !strings.Contains(markdown, "**Source:** https://example.com/full") {
+                 t.Error("Expected markdown to contain source URL")
+             }
+             if !strings.Contains(markdown, "**Saved:**") {
+                 t.Error("Expected markdown to contain saved timestamp")
+             }
+             if !strings.Contains(markdown, "---") {
+                 t.Error("Expected markdown to contain separator")
+             }
+             if !strings.Contains(markdown, "Complete article content here.") {
+                 t.Error("Expected markdown to contain article content")
+             }
+         })
+
+         t.Run("creates markdown with minimal fields", func(t *testing.T) {
+             content := &ParsedContent{
+                 Title:   "Minimal Article",
+                 Content: "Just content.",
+                 URL:     "https://example.com/minimal",
+             }
+
+             markdown := parser.createMarkdown(content)
+
+             if !strings.Contains(markdown, "# Minimal Article") {
+                 t.Error("Expected markdown to contain title")
+             }
+             if strings.Contains(markdown, "**Author:**") {
+                 t.Error("Expected no author field for empty author")
+             }
+             if strings.Contains(markdown, "**Date:**") {
+                 t.Error("Expected no date field for empty date")
+             }
+             if !strings.Contains(markdown, "**Source:** https://example.com/minimal") {
+                 t.Error("Expected markdown to contain source URL")
+             }
+         })
+     })
+ }
+
+ func TestCreateArticleFromURL(t *testing.T) {
+     tempDir := t.TempDir()
+
+     t.Run("fails with invalid URL", func(t *testing.T) {
+         _, err := CreateArticleFromURL("not-a-url", tempDir)
+         if err == nil {
+             t.Error("Expected error for invalid URL")
+         }
+     })
+
+     t.Run("fails with unsupported domain", func(t *testing.T) {
+         server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+             w.WriteHeader(http.StatusOK)
+             w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
+         }))
+         defer server.Close()
+
+         _, err := CreateArticleFromURL(server.URL, tempDir)
+         if err == nil {
+             t.Error("Expected error for unsupported domain")
+         }
+     })
+
+     t.Run("successfully creates article from Wikipedia-like URL", func(t *testing.T) {
+         wikipediaHTML := `<html>
+ <head><title>Integration Test Article</title></head>
+ <body>
+ <h1 id="firstHeading">Integration Test Article</h1>
+ <div id="bodyContent">
+ <p>This is integration test content.</p>
+ </div>
+ </body>
+ </html>`
+
+         server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+             w.WriteHeader(http.StatusOK)
+             w.Write([]byte(wikipediaHTML))
+         }))
+         defer server.Close()
+
+         // We need to patch the CreateArticleFromURL function to use our test client and rules
+         // For now, let's test the components individually since CreateArticleFromURL uses NewArticleParser internally
+         parser, err := NewArticleParser(server.Client())
+         if err != nil {
+             t.Fatalf("Failed to create parser: %v", err)
+         }
+
+         // Add localhost rule for testing
+         localhostRule := &ParsingRule{
+             Domain: "127.0.0.1",
+             Title:  "//h1[@id='firstHeading']",
+             Body:   "//div[@id='bodyContent']",
+             Strip:  []string{"//div[@class='noprint']"},
+         }
+         parser.AddRule("127.0.0.1", localhostRule)
+
+         content, err := parser.ParseURL(server.URL)
+         if err != nil {
+             t.Fatalf("Expected no error, got %v", err)
+         }
+
+         mdPath, htmlPath, err := parser.SaveArticle(content, tempDir)
+         if err != nil {
+             t.Fatalf("Failed to save article: %v", err)
+         }
+
+         if content.Title != "Integration Test Article" {
+             t.Errorf("Expected title 'Integration Test Article', got %s", content.Title)
+         }
+         if mdPath == "" {
+             t.Error("Expected non-empty markdown path")
+         }
+         if htmlPath == "" {
+             t.Error("Expected non-empty HTML path")
+         }
+
+         // Check files exist
+         if _, err := os.Stat(mdPath); os.IsNotExist(err) {
+             t.Error("Expected markdown file to exist")
+         }
+         if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
+             t.Error("Expected HTML file to exist")
+         }
      })
  }
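
The httptest server plus the 127.0.0.1 rule is set up twice in the new tests (in ParseURL and in TestCreateArticleFromURL). One possible follow-up, not part of this commit, is a shared test helper; the sketch below only uses identifiers that already appear in this diff, and the helper name itself is hypothetical:

// newLocalParser is a hypothetical test helper (not in this diff) that wraps
// the repeated setup: start an httptest server serving the given HTML, build a
// parser on server.Client(), and register a rule for the loopback host.
func newLocalParser(t *testing.T, html string) (*ArticleParser, *httptest.Server) {
    t.Helper()

    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        w.WriteHeader(http.StatusOK)
        w.Write([]byte(html))
    }))
    t.Cleanup(server.Close)

    parser, err := NewArticleParser(server.Client())
    if err != nil {
        t.Fatalf("Failed to create parser: %v", err)
    }

    parser.AddRule("127.0.0.1", &ParsingRule{
        Domain: "127.0.0.1",
        Title:  "//h1[@id='firstHeading']",
        Body:   "//div[@id='bodyContent']",
        Strip:  []string{"//div[@class='noprint']"},
    })

    return parser, server
}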