cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
	exhtml "golang.org/x/net/html"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title            string
	Author           string
	Date             string
	Content          string
	URL              string
	Confidence       float64 // 0-1 scale, confidence in extraction quality
	ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain            string
	Title             string
	Author            string
	Date              string
	Body              string
	Strip             []string // XPath selectors for elements to remove
	StripIDsOrClasses []string
	TestURLs          []string
	Headers           map[string]string
	Prune             bool
	Tidy              bool
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules             map[string]*ParsingRule
	client            *http.Client
	heuristicExtract  *HeuristicExtractor
	metadataExtractor *MetadataExtractor
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:             make(map[string]*ParsingRule),
		client:            client,
		heuristicExtract:  NewHeuristicExtractor(),
		metadataExtractor: NewMetadataExtractor(),
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

// SetHTTPClient overrides the HTTP client used for fetching article content.
func (p *ArticleParser) SetHTTPClient(client *http.Client) {
	p.client = client
}

func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "strip_id_or_class":
			rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
		case "prune":
			rule.Prune = parseBool(value)
		case "tidy":
			rule.Tidy = parseBool(value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		default:
			if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
				headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
				if headerName != "" {
					if rule.Headers == nil {
						rule.Headers = make(map[string]string)
					}
					rule.Headers[http.CanonicalHeaderKey(headerName)] = value
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

func parseBool(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "1", "true", "yes", "on":
		return true
	default:
		return false
	}
}

func (p *ArticleParser) findRule(domain string) *ParsingRule {
	for ruleDomain, rule := range p.rules {
		if domain == ruleDomain || strings.HasSuffix(domain, ruleDomain) {
			return rule
		}
	}
	return nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(s)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()
	rule := p.findRule(domain)
	req, err := http.NewRequest(http.MethodGet, s, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	if rule != nil {
		for header, value := range rule.Headers {
			if value == "" {
				continue
			}
			if req.Header.Get(header) == "" {
				req.Header.Set(header, value)
			}
		}
	}

	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, s)
}

// Parse extracts article content from an HTML string using domain-specific rules with heuristic fallback.
// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	rule := p.findRule(domain)

	if rule == nil {
		return p.parseWithHeuristics(doc, sourceURL)
	}

	content := &ParsedContent{
		URL:              sourceURL,
		ExtractionMethod: "xpath",
		Confidence:       0.85,
	}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}
	if content.Title == "" {
		content.Title = p.metadataExtractor.ExtractTitle(doc)
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}
	if content.Author == "" {
		content.Author = p.metadataExtractor.ExtractAuthor(doc)
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}
	if content.Date == "" {
		content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
	}

	if rule.Body != "" {
		bodyNode := htmlquery.FindOne(doc, rule.Body)
		if bodyNode == nil {
			return p.parseWithHeuristics(doc, sourceURL)
		}

		for _, stripXPath := range rule.Strip {
			removeNodesByXPath(bodyNode, stripXPath)
		}

		for _, identifier := range rule.StripIDsOrClasses {
			removeNodesByIdentifier(bodyNode, identifier)
		}

		removeDefaultNonContentNodes(bodyNode)

		xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))

		heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
		if heuristicResult != nil {
			content.Content = heuristicResult.Content
			if content.Content == "" {
				content.Content = xpathContent
			}
			content.Confidence = heuristicResult.Confidence
			content.ExtractionMethod = heuristicResult.ExtractionMethod
		} else {
			content.Content = xpathContent
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
	result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
	if result == nil {
		result = &ExtractionResult{
			ExtractionMethod: "heuristic-failed",
			Confidence:       0.0,
		}
	}

	metadata := p.metadataExtractor.ExtractMetadata(doc)
	if metadata != nil {
		if result.Title == "" {
			result.Title = metadata.Title
		}
		if result.Author == "" {
			result.Author = metadata.Author
		}
		if result.PublishedDate == "" {
			result.PublishedDate = metadata.PublishedDate
		}
	}

	content := &ParsedContent{
		Title:            result.Title,
		Author:           result.Author,
		Date:             result.PublishedDate,
		Content:          result.Content,
		URL:              sourceURL,
		Confidence:       result.Confidence,
		ExtractionMethod: result.ExtractionMethod,
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML using heuristics")
	}

	if content.Confidence < 0.3 {
		return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
	}

	return content, nil
}

func removeNodesByXPath(root *exhtml.Node, xpath string) {
	if root == nil {
		return
	}

	xpath = strings.TrimSpace(xpath)
	if xpath == "" {
		return
	}

	nodes := htmlquery.Find(root, xpath)
	for _, node := range nodes {
		if node != nil && node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
	identifier = strings.TrimSpace(identifier)
	if root == nil || identifier == "" {
		return
	}

	idLiteral := buildXPathLiteral(identifier)
	removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))

	classLiteral := buildXPathLiteral(" " + identifier + " ")
	removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
}

func removeDefaultNonContentNodes(root *exhtml.Node) {
	for _, xp := range []string{
		".//script",
		".//style",
		".//noscript",
	} {
		removeNodesByXPath(root, xp)
	}
}

func normalizeWhitespace(value string) string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	return strings.TrimSpace(value)
}

func buildXPathLiteral(value string) string {
	if !strings.Contains(value, "'") {
		return "'" + value + "'"
	}

	if !strings.Contains(value, "\"") {
		return `"` + value + `"`
	}

	segments := strings.Split(value, "'")
	var builder strings.Builder
	builder.WriteString("concat(")

	for i, segment := range segments {
		if i > 0 {
			builder.WriteString(", \"'\", ")
		}
		if segment == "" {
			builder.WriteString("''")
			continue
		}
		builder.WriteString("'")
		builder.WriteString(segment)
		builder.WriteString("'")
	}

	builder.WriteString(")")
	return builder.String()
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(dir, slug+".md")
	baseHTMLPath := filepath.Join(dir, slug+".html")
	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(dir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(dir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}

func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	return &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}, nil
}
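
For context, the embedded rules/*.txt files read by parseRules above are plain key/value lines (title:, author:, date:, body:, strip:, strip_id_or_class:, prune:, tidy:, test_url:, and http_header(Name): entries; lines starting with # are comments). The following is a minimal usage sketch of this package as it might be called from another package inside the noteleaf module; the import path github.com/stormlightlabs/noteleaf/internal/articles is an assumption based on the models import in this file (only the package name "articles" is confirmed), and the URL and storage directory are placeholders.

// Usage sketch (not part of the package). Assumes the import path below;
// the URL and output directory are illustrative placeholders.
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/stormlightlabs/noteleaf/internal/articles"
)

func main() {
	// One-shot convenience helper: fetch, parse, and write .md/.html files,
	// returning a models.Article record pointing at the saved paths.
	article, err := articles.CreateArticleFromURL("https://example.com/post", "./saved-articles")
	if err != nil {
		log.Fatalf("save failed: %v", err)
	}
	fmt.Println("markdown:", article.MarkdownPath, "html:", article.HTMLPath)

	// Lower-level path: build a parser explicitly to inspect extraction
	// quality before deciding whether to persist anything.
	p, err := articles.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}
	content, err := p.ParseURL("https://example.com/post")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("extracted %q via %s (confidence %.2f)\n",
		content.Title, content.ExtractionMethod, content.Confidence)
}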