cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
	exhtml "golang.org/x/net/html"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title            string
	Author           string
	Date             string
	Content          string
	URL              string
	Confidence       float64 // 0-1 scale, confidence in extraction quality
	ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain            string
	Title             string
	Author            string
	Date              string
	Body              string
	Strip             []string // XPath selectors for elements to remove
	StripIDsOrClasses []string
	TestURLs          []string
	Headers           map[string]string
	Prune             bool
	Tidy              bool
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)
	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)
	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string
	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules             map[string]*ParsingRule
	client            *http.Client
	heuristicExtract  *HeuristicExtractor
	metadataExtractor *MetadataExtractor
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:             make(map[string]*ParsingRule),
		client:            client,
		heuristicExtract:  NewHeuristicExtractor(),
		metadataExtractor: NewMetadataExtractor(),
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

// SetHTTPClient overrides the HTTP client used for fetching article content.
func (p *ArticleParser) SetHTTPClient(client *http.Client) {
	p.client = client
}

func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")

		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}

func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "strip_id_or_class":
			rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
		case "prune":
			rule.Prune = parseBool(value)
		case "tidy":
			rule.Tidy = parseBool(value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		default:
			if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
				headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
				if headerName != "" {
					if rule.Headers == nil {
						rule.Headers = make(map[string]string)
					}
					rule.Headers[http.CanonicalHeaderKey(headerName)] = value
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

func parseBool(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "1", "true", "yes", "on":
		return true
	default:
		return false
	}
}

func (p *ArticleParser) findRule(domain string) *ParsingRule {
	for ruleDomain, rule := range p.rules {
		if domain == ruleDomain || strings.HasSuffix(domain, ruleDomain) {
			return rule
		}
	}
	return nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(s)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()
	rule := p.findRule(domain)
	req, err := http.NewRequest(http.MethodGet, s, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	if rule != nil {
		for header, value := range rule.Headers {
			if value == "" {
				continue
			}
			if req.Header.Get(header) == "" {
				req.Header.Set(header, value)
			}
		}
	}

	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, s)
}

// Parse extracts article content from an HTML string using domain-specific rules with heuristic fallback.
// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	rule := p.findRule(domain)

	if rule == nil {
		return p.parseWithHeuristics(doc, sourceURL)
	}

	content := &ParsedContent{
		URL:              sourceURL,
		ExtractionMethod: "xpath",
		Confidence:       0.85,
	}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}
	if content.Title == "" {
		content.Title = p.metadataExtractor.ExtractTitle(doc)
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}
	if content.Author == "" {
		content.Author = p.metadataExtractor.ExtractAuthor(doc)
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}
	if content.Date == "" {
		content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
	}

	if rule.Body != "" {
		bodyNode := htmlquery.FindOne(doc, rule.Body)
		if bodyNode == nil {
			return p.parseWithHeuristics(doc, sourceURL)
		}

		for _, stripXPath := range rule.Strip {
			removeNodesByXPath(bodyNode, stripXPath)
		}

		for _, identifier := range rule.StripIDsOrClasses {
			removeNodesByIdentifier(bodyNode, identifier)
		}

		removeDefaultNonContentNodes(bodyNode)

		xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))

		heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
		if heuristicResult != nil {
			content.Content = heuristicResult.Content
			if content.Content == "" {
				content.Content = xpathContent
			}
			content.Confidence = heuristicResult.Confidence
			content.ExtractionMethod = heuristicResult.ExtractionMethod
		} else {
			content.Content = xpathContent
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
	result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
	if result == nil {
		result = &ExtractionResult{
			ExtractionMethod: "heuristic-failed",
			Confidence:       0.0,
		}
	}

	metadata := p.metadataExtractor.ExtractMetadata(doc)
	if metadata != nil {
		if result.Title == "" {
			result.Title = metadata.Title
		}
		if result.Author == "" {
			result.Author = metadata.Author
		}
		if result.PublishedDate == "" {
			result.PublishedDate = metadata.PublishedDate
		}
	}

	content := &ParsedContent{
		Title:            result.Title,
		Author:           result.Author,
		Date:             result.PublishedDate,
		Content:          result.Content,
		URL:              sourceURL,
		Confidence:       result.Confidence,
		ExtractionMethod: result.ExtractionMethod,
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML using heuristics")
	}

	if content.Confidence < 0.3 {
		return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
	}

	return content, nil
}

func removeNodesByXPath(root *exhtml.Node, xpath string) {
	if root == nil {
		return
	}

	xpath = strings.TrimSpace(xpath)
	if xpath == "" {
		return
	}

	nodes := htmlquery.Find(root, xpath)
	for _, node := range nodes {
		if node != nil && node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
	identifier = strings.TrimSpace(identifier)
	if root == nil || identifier == "" {
		return
	}

	idLiteral := buildXPathLiteral(identifier)
	removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))

	classLiteral := buildXPathLiteral(" " + identifier + " ")
	removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
}

func removeDefaultNonContentNodes(root *exhtml.Node) {
	for _, xp := range []string{
		".//script",
		".//style",
		".//noscript",
	} {
		removeNodesByXPath(root, xp)
	}
}

func normalizeWhitespace(value string) string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	return strings.TrimSpace(value)
}

func buildXPathLiteral(value string) string {
	if !strings.Contains(value, "'") {
		return "'" + value + "'"
	}

	if !strings.Contains(value, "\"") {
		return `"` + value + `"`
	}

	segments := strings.Split(value, "'")
	var builder strings.Builder
	builder.WriteString("concat(")

	for i, segment := range segments {
		if i > 0 {
			builder.WriteString(", \"'\", ")
		}
		if segment == "" {
			builder.WriteString("''")
			continue
		}
		builder.WriteString("'")
		builder.WriteString(segment)
		builder.WriteString("'")
	}

	builder.WriteString(")")
	return builder.String()
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}

	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(dir, slug+".md")
	baseHTMLPath := filepath.Join(dir, slug+".html")
	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath

	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(dir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(dir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)

	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)

	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)

	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")

	slug = strings.Trim(slug, "-")

	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}

	return slug
}

func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))

	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}

	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}

	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))

	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)

	htmlBody := markdown.Render(doc, renderer)

	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf(" <title>%s</title>\n", content.Title))
	builder.WriteString(" <meta charset=\"UTF-8\">\n")
	builder.WriteString(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n")
	builder.WriteString(" <style>\n")
	builder.WriteString(" body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }\n")
	builder.WriteString(" pre { background-color: #f4f4f4; padding: 10px; border-radius: 4px; overflow-x: auto; }\n")
	builder.WriteString(" blockquote { border-left: 4px solid #ccc; padding-left: 16px; margin-left: 0; }\n")
	builder.WriteString(" </style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>")

	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	return &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}, nil
}
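
For context, the embedded rules/*.txt files read by parseRules above are plain key/value lines (title:, author:, date:, body:, strip:, strip_id_or_class:, prune:, tidy:, test_url:, and http_header(Name): entries; lines starting with # are comments). The following is a minimal usage sketch of this package as it might be called from another package inside the noteleaf module; the import path github.com/stormlightlabs/noteleaf/internal/articles is an assumption based on the models import in this file (only the package name "articles" is confirmed), and the URL and storage directory are placeholders.

// Usage sketch (not part of the package). Assumes the import path below;
// the URL and output directory are illustrative placeholders.
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/stormlightlabs/noteleaf/internal/articles"
)

func main() {
	// One-shot convenience helper: fetch, parse, and write .md/.html files,
	// returning a models.Article record pointing at the saved paths.
	article, err := articles.CreateArticleFromURL("https://example.com/post", "./saved-articles")
	if err != nil {
		log.Fatalf("save failed: %v", err)
	}
	fmt.Println("markdown:", article.MarkdownPath, "html:", article.HTMLPath)

	// Lower-level path: build a parser explicitly to inspect extraction
	// quality before deciding whether to persist anything.
	p, err := articles.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}
	content, err := p.ParseURL("https://example.com/post")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("extracted %q via %s (confidence %.2f)\n",
		content.Title, content.ExtractionMethod, content.Confidence)
}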