CLI + TUI to publish to Leaflet (WIP) and manage tasks, notes, and watch/read lists 🍃
charm
leaflet
readability
golang
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
	exhtml "golang.org/x/net/html"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title            string
	Author           string
	Date             string
	Content          string
	URL              string
	Confidence       float64 // 0-1 scale, confidence in extraction quality
	ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain            string
	Title             string
	Author            string
	Date              string
	Body              string
	Strip             []string // XPath selectors for elements to remove
	StripIDsOrClasses []string
	TestURLs          []string
	Headers           map[string]string
	Prune             bool
	Tidy              bool
}

// Parser defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert converts HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to the filesystem and returns the markdown and HTML file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules             map[string]*ParsingRule
	client            *http.Client
	heuristicExtract  *HeuristicExtractor
	metadataExtractor *MetadataExtractor
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:             make(map[string]*ParsingRule),
		client:            client,
		heuristicExtract:  NewHeuristicExtractor(),
		metadataExtractor: NewMetadataExtractor(),
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}
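
// Illustrative usage (a sketch, not part of the package API): construct a
// parser with any *http.Client and fetch an article. The URL is hypothetical.
//
//	parser, err := NewArticleParser(http.DefaultClient)
//	if err != nil {
//		// handle error
//	}
//	content, err := parser.ParseURL("https://example.com/some-article")
//	if err != nil {
//		// handle error
//	}
//	fmt.Println(content.Title, content.ExtractionMethod, content.Confidence)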

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

// SetHTTPClient overrides the HTTP client used for fetching article content.
func (p *ArticleParser) SetHTTPClient(client *http.Client) {
	p.client = client
}
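
// For example, a caller that wants an explicit timeout could swap in its own
// client (illustrative values):
//
//	parser.SetHTTPClient(&http.Client{Timeout: 30 * time.Second})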

// loadRules reads every embedded rules/*.txt file and registers a ParsingRule
// keyed by the file's domain name.
func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

// parseRules parses a rule file's "key: value" lines into a ParsingRule.
// Unknown keys other than http_header(...) are ignored.
func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "strip_id_or_class":
			rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
		case "prune":
			rule.Prune = parseBool(value)
		case "tidy":
			rule.Tidy = parseBool(value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		default:
			if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
				headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
				if headerName != "" {
					if rule.Headers == nil {
						rule.Headers = make(map[string]string)
					}
					rule.Headers[http.CanonicalHeaderKey(headerName)] = value
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}
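
// A rule file is a plain-text list of "key: value" lines; blank lines and
// lines beginning with "#" are ignored. A hypothetical rules/example.com.txt
// (all selectors and values below are illustrative) could look like:
//
//	# Extraction rules for example.com
//	title: //h1[@class='headline']
//	author: //span[@class='byline']
//	date: //time[@datetime]
//	body: //article
//	strip: //aside
//	strip_id_or_class: newsletter-signup
//	prune: true
//	tidy: true
//	http_header(User-Agent): Mozilla/5.0 (compatible; noteleaf)
//	test_url: https://example.com/2024/01/some-article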

// parseBool treats "1", "true", "yes", and "on" (case-insensitive) as true.
func parseBool(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "1", "true", "yes", "on":
		return true
	default:
		return false
	}
}

// findRule returns the rule whose domain matches the host exactly or is a
// parent domain of it (e.g. a rule for "example.com" also covers
// "www.example.com"), or nil when no rule applies.
func (p *ArticleParser) findRule(domain string) *ParsingRule {
	for ruleDomain, rule := range p.rules {
		if domain == ruleDomain || strings.HasSuffix(domain, "."+ruleDomain) {
			return rule
		}
	}
	return nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(s)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()
	rule := p.findRule(domain)
	req, err := http.NewRequest(http.MethodGet, s, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	if rule != nil {
		for header, value := range rule.Headers {
			if value == "" {
				continue
			}
			if req.Header.Get(header) == "" {
				req.Header.Set(header, value)
			}
		}
	}

	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, s)
}

// Parse extracts article content from an HTML string using domain-specific rules with a heuristic fallback.
// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	rule := p.findRule(domain)

	if rule == nil {
		return p.parseWithHeuristics(doc, sourceURL)
	}

	content := &ParsedContent{
		URL:              sourceURL,
		ExtractionMethod: "xpath",
		Confidence:       0.85,
	}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}
	if content.Title == "" {
		content.Title = p.metadataExtractor.ExtractTitle(doc)
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}
	if content.Author == "" {
		content.Author = p.metadataExtractor.ExtractAuthor(doc)
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}
	if content.Date == "" {
		content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
	}

	if rule.Body != "" {
		bodyNode := htmlquery.FindOne(doc, rule.Body)
		if bodyNode == nil {
			return p.parseWithHeuristics(doc, sourceURL)
		}

		for _, stripXPath := range rule.Strip {
			removeNodesByXPath(bodyNode, stripXPath)
		}

		for _, identifier := range rule.StripIDsOrClasses {
			removeNodesByIdentifier(bodyNode, identifier)
		}

		removeDefaultNonContentNodes(bodyNode)

		xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))

		heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
		if heuristicResult != nil {
			content.Content = heuristicResult.Content
			if content.Content == "" {
				content.Content = xpathContent
			}
			content.Confidence = heuristicResult.Confidence
			content.ExtractionMethod = heuristicResult.ExtractionMethod
		} else {
			content.Content = xpathContent
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}
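
// Parse can also be driven with HTML the caller has already fetched, e.g.
// (illustrative sketch):
//
//	raw := `<html><head><title>T</title></head><body><article><p>Body text</p></article></body></html>`
//	content, err := parser.Parse(raw, "example.com", "https://example.com/post")
//	if err == nil {
//		fmt.Println(content.ExtractionMethod, content.Confidence)
//	}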

// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
	result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
	if result == nil {
		result = &ExtractionResult{
			ExtractionMethod: "heuristic-failed",
			Confidence:       0.0,
		}
	}

	metadata := p.metadataExtractor.ExtractMetadata(doc)
	if metadata != nil {
		if result.Title == "" {
			result.Title = metadata.Title
		}
		if result.Author == "" {
			result.Author = metadata.Author
		}
		if result.PublishedDate == "" {
			result.PublishedDate = metadata.PublishedDate
		}
	}

	content := &ParsedContent{
		Title:            result.Title,
		Author:           result.Author,
		Date:             result.PublishedDate,
		Content:          result.Content,
		URL:              sourceURL,
		Confidence:       result.Confidence,
		ExtractionMethod: result.ExtractionMethod,
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML using heuristics")
	}

	if content.Confidence < 0.3 {
		return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
	}

	return content, nil
}

// removeNodesByXPath detaches every node under root that matches the given XPath expression.
func removeNodesByXPath(root *exhtml.Node, xpath string) {
	if root == nil {
		return
	}

	xpath = strings.TrimSpace(xpath)
	if xpath == "" {
		return
	}

	nodes := htmlquery.Find(root, xpath)
	for _, node := range nodes {
		if node != nil && node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeNodesByIdentifier removes elements whose id equals the identifier or
// whose class list contains it as a token.
func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
	identifier = strings.TrimSpace(identifier)
	if root == nil || identifier == "" {
		return
	}

	idLiteral := buildXPathLiteral(identifier)
	removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))

	classLiteral := buildXPathLiteral(" " + identifier + " ")
	removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
}
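
// For an identifier such as "sidebar", the two queries above expand to
//
//	.//*[@id='sidebar']
//	.//*[contains(concat(' ', normalize-space(@class), ' '), ' sidebar ')]
//
// so elements with id="sidebar" or a "sidebar" class token are removed.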

// removeDefaultNonContentNodes strips script, style, and noscript elements
// that never contribute article text.
func removeDefaultNonContentNodes(root *exhtml.Node) {
	for _, xp := range []string{
		".//script",
		".//style",
		".//noscript",
	} {
		removeNodesByXPath(root, xp)
	}
}

// normalizeWhitespace replaces non-breaking spaces and trims surrounding whitespace.
func normalizeWhitespace(value string) string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	return strings.TrimSpace(value)
}

// buildXPathLiteral quotes a string for use inside an XPath expression,
// falling back to concat() when the value contains both quote characters.
func buildXPathLiteral(value string) string {
	if !strings.Contains(value, "'") {
		return "'" + value + "'"
	}

	if !strings.Contains(value, "\"") {
		return `"` + value + `"`
	}

	segments := strings.Split(value, "'")
	var builder strings.Builder
	builder.WriteString("concat(")

	for i, segment := range segments {
		if i > 0 {
			builder.WriteString(", \"'\", ")
		}
		if segment == "" {
			builder.WriteString("''")
			continue
		}
		builder.WriteString("'")
		builder.WriteString(segment)
		builder.WriteString("'")
	}

	builder.WriteString(")")
	return builder.String()
}
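
// A few inputs and the literals they produce (derived from the cases above):
//
//	buildXPathLiteral("sidebar")          // 'sidebar'
//	buildXPathLiteral("O'Reilly")         // "O'Reilly"
//	buildXPathLiteral(`it's a "test"`)    // concat('it', "'", 's a "test"')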

// Convert converts HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to the filesystem and returns the markdown and HTML file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(dir, slug+".md")
	baseHTMLPath := filepath.Join(dir, slug+".html")
	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(dir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(dir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}
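
// Illustrative call (directory and title are hypothetical): with content.Title
// set to "My Title", the first save writes my-title.md and my-title.html under
// dir; if those files already exist, the next save falls back to my-title_1.md
// and my-title_1.html, and so on.
//
//	mdPath, htmlPath, err := parser.SaveArticle(content, "/tmp/noteleaf-articles")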

// slugify lowercases the title, collapses every non-alphanumeric run into a
// hyphen, and caps the result at 100 characters.
func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}
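
// Example transformations (follow directly from the regexp above):
//
//	p.slugify("Hello, World! 2024")   // "hello-world-2024"
//	p.slugify("Ünïcödé Title")        // "n-c-d-title" (non-ASCII letters are dropped)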

// createMarkdown renders the parsed content as a markdown document with a
// small metadata header.
func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}
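
// The generated markdown has a small metadata header followed by the body,
// roughly (field values are illustrative):
//
//	# Article Title
//
//	**Author:** Jane Doe
//
//	**Date:** 2024-01-02
//
//	**Source:** https://example.com/post
//
//	**Saved:** 2024-01-03 10:04:05
//
//	---
//
//	...article body...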

// createHTML converts the markdown to HTML and wraps it in a minimal standalone page.
func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that fetches and parses a URL,
// saves the result under dir, and returns a [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	return &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}, nil
}
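
// Illustrative end-to-end usage (URL and directory are hypothetical):
//
//	article, err := CreateArticleFromURL("https://example.com/post", "/tmp/noteleaf-articles")
//	if err != nil {
//		// handle error
//	}
//	fmt.Println(article.Title, article.MarkdownPath, article.HTMLPath)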