package articles

import (
	"bufio"
	"embed"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"github.com/gomarkdown/markdown"
	"github.com/gomarkdown/markdown/html"
	"github.com/gomarkdown/markdown/parser"
	"github.com/stormlightlabs/noteleaf/internal/models"
	exhtml "golang.org/x/net/html"
)

//go:embed rules/*.txt
var rulesFS embed.FS

// ParsedContent represents the extracted content from a web page
type ParsedContent struct {
	Title            string
	Author           string
	Date             string
	Content          string
	URL              string
	Confidence       float64 // 0-1 scale, confidence in extraction quality
	ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
}

// ParsingRule represents XPath rules for extracting content from a specific domain
type ParsingRule struct {
	Domain            string
	Title             string
	Author            string
	Date              string
	Body              string
	Strip             []string // XPath selectors for elements to remove
	StripIDsOrClasses []string
	TestURLs          []string
	Headers           map[string]string
	Prune             bool
	Tidy              bool
}

// Parser interface defines methods for parsing articles from URLs
type Parser interface {
	// ParseURL extracts article content from a given URL
	ParseURL(url string) (*ParsedContent, error)

	// Convert HTML content directly to markdown using domain-specific rules
	Convert(htmlContent, domain, sourceURL string) (string, error)

	// GetSupportedDomains returns a list of domains that have parsing rules
	GetSupportedDomains() []string

	// SaveArticle saves the parsed content to filesystem and returns file paths
	SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}

// ArticleParser implements the Parser interface
type ArticleParser struct {
	rules             map[string]*ParsingRule
	client            *http.Client
	heuristicExtract  *HeuristicExtractor
	metadataExtractor *MetadataExtractor
}

// NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
func NewArticleParser(client *http.Client) (*ArticleParser, error) {
	parser := &ArticleParser{
		rules:             make(map[string]*ParsingRule),
		client:            client,
		heuristicExtract:  NewHeuristicExtractor(),
		metadataExtractor: NewMetadataExtractor(),
	}

	if err := parser.loadRules(); err != nil {
		return nil, fmt.Errorf("failed to load parsing rules: %w", err)
	}

	return parser, nil
}
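
// Example (illustrative sketch, as it might be written from a calling package;
// the client settings, HTML snippet, domain, and URL are hypothetical):
//
//	client := &http.Client{Timeout: 30 * time.Second}
//	p, err := articles.NewArticleParser(client)
//	if err != nil {
//		// handle error
//	}
//	md, err := p.Convert("<html>...</html>", "example.com", "https://example.com/post")
//	if err != nil {
//		// handle error
//	}
//	fmt.Println(md)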

// AddRule adds or replaces a parsing rule for a specific domain
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
	p.rules[domain] = rule
}

// SetHTTPClient overrides the HTTP client used for fetching article content.
func (p *ArticleParser) SetHTTPClient(client *http.Client) {
	p.client = client
}

func (p *ArticleParser) loadRules() error {
	entries, err := rulesFS.ReadDir("rules")
	if err != nil {
		return fmt.Errorf("failed to read rules directory: %w", err)
	}

	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".txt") {
			continue
		}

		domain := strings.TrimSuffix(entry.Name(), ".txt")
		content, err := rulesFS.ReadFile(filepath.Join("rules", entry.Name()))
		if err != nil {
			return fmt.Errorf("failed to read rule file %s: %w", entry.Name(), err)
		}

		rule, err := p.parseRules(domain, string(content))
		if err != nil {
			return fmt.Errorf("failed to parse rule file %s: %w", entry.Name(), err)
		}

		p.rules[domain] = rule
	}

	return nil
}
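
// parseRules turns the line-oriented "key: value" rule format into a
// ParsingRule. Blank lines and lines starting with '#' are ignored; unknown
// keys are skipped. An illustrative (hypothetical) rule file such as
// rules/example.com.txt might look like:
//
//	# Rules for example.com
//	title: //h1[@class='headline']
//	author: //span[@class='byline']
//	date: //time[@datetime]
//	body: //article
//	strip: //aside
//	strip_id_or_class: newsletter-signup
//	http_header(User-Agent): Mozilla/5.0 (compatible; noteleaf)
//	test_url: https://example.com/some-article
//	prune: yes
//	tidy: no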
func (p *ArticleParser) parseRules(domain, content string) (*ParsingRule, error) {
	rule := &ParsingRule{Domain: domain, Strip: []string{}}
	scanner := bufio.NewScanner(strings.NewReader(content))

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "title":
			rule.Title = value
		case "author":
			rule.Author = value
		case "date":
			rule.Date = value
		case "body":
			rule.Body = value
		case "strip":
			rule.Strip = append(rule.Strip, value)
		case "strip_id_or_class":
			rule.StripIDsOrClasses = append(rule.StripIDsOrClasses, value)
		case "prune":
			rule.Prune = parseBool(value)
		case "tidy":
			rule.Tidy = parseBool(value)
		case "test_url":
			rule.TestURLs = append(rule.TestURLs, value)
		default:
			if strings.HasPrefix(key, "http_header(") && strings.HasSuffix(key, ")") {
				headerName := strings.TrimSuffix(strings.TrimPrefix(key, "http_header("), ")")
				if headerName != "" {
					if rule.Headers == nil {
						rule.Headers = make(map[string]string)
					}
					rule.Headers[http.CanonicalHeaderKey(headerName)] = value
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading rule file: %w", err)
	}

	return rule, nil
}

func parseBool(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "1", "true", "yes", "on":
		return true
	default:
		return false
	}
}
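
// findRule returns the parsing rule for a domain, matching either the exact
// rule domain or any rule domain the host ends with, so a rule loaded from
// example.com.txt also applies to www.example.com. The match is a plain suffix
// check, so an unrelated host such as notexample.com would match as well.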
func (p *ArticleParser) findRule(domain string) *ParsingRule {
	for ruleDomain, rule := range p.rules {
		if domain == ruleDomain || strings.HasSuffix(domain, ruleDomain) {
			return rule
		}
	}
	return nil
}

// ParseURL extracts article content from a given URL
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error) {
	parsedURL, err := url.Parse(s)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	domain := parsedURL.Hostname()
	rule := p.findRule(domain)

	req, err := http.NewRequest(http.MethodGet, s, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	if rule != nil {
		for header, value := range rule.Headers {
			if value == "" {
				continue
			}
			if req.Header.Get(header) == "" {
				req.Header.Set(header, value)
			}
		}
	}

	resp, err := p.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP error: %d", resp.StatusCode)
	}

	htmlBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return p.Parse(string(htmlBytes), domain, s)
}

// Parse extracts article content from an HTML string using domain-specific rules with heuristic fallback.
// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	rule := p.findRule(domain)
	if rule == nil {
		return p.parseWithHeuristics(doc, sourceURL)
	}

	content := &ParsedContent{
		URL:              sourceURL,
		ExtractionMethod: "xpath",
		Confidence:       0.85,
	}

	if rule.Title != "" {
		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
		}
	}
	if content.Title == "" {
		content.Title = p.metadataExtractor.ExtractTitle(doc)
	}

	if rule.Author != "" {
		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
		}
	}
	if content.Author == "" {
		content.Author = p.metadataExtractor.ExtractAuthor(doc)
	}

	if rule.Date != "" {
		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
		}
	}
	if content.Date == "" {
		content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
	}

	if rule.Body != "" {
		bodyNode := htmlquery.FindOne(doc, rule.Body)
		if bodyNode == nil {
			return p.parseWithHeuristics(doc, sourceURL)
		}

		for _, stripXPath := range rule.Strip {
			removeNodesByXPath(bodyNode, stripXPath)
		}
		for _, identifier := range rule.StripIDsOrClasses {
			removeNodesByIdentifier(bodyNode, identifier)
		}
		removeDefaultNonContentNodes(bodyNode)

		xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))

		heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
		if heuristicResult != nil {
			content.Content = heuristicResult.Content
			if content.Content == "" {
				content.Content = xpathContent
			}
			content.Confidence = heuristicResult.Confidence
			content.ExtractionMethod = heuristicResult.ExtractionMethod
		} else {
			content.Content = xpathContent
		}
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML")
	}

	return content, nil
}

// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
	result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
	if result == nil {
		result = &ExtractionResult{
			ExtractionMethod: "heuristic-failed",
			Confidence:       0.0,
		}
	}

	metadata := p.metadataExtractor.ExtractMetadata(doc)
	if metadata != nil {
		if result.Title == "" {
			result.Title = metadata.Title
		}
		if result.Author == "" {
			result.Author = metadata.Author
		}
		if result.PublishedDate == "" {
			result.PublishedDate = metadata.PublishedDate
		}
	}

	content := &ParsedContent{
		Title:            result.Title,
		Author:           result.Author,
		Date:             result.PublishedDate,
		Content:          result.Content,
		URL:              sourceURL,
		Confidence:       result.Confidence,
		ExtractionMethod: result.ExtractionMethod,
	}

	if content.Title == "" {
		return nil, fmt.Errorf("could not extract title from HTML using heuristics")
	}
	if content.Confidence < 0.3 {
		return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
	}

	return content, nil
}

func removeNodesByXPath(root *exhtml.Node, xpath string) {
	if root == nil {
		return
	}
	xpath = strings.TrimSpace(xpath)
	if xpath == "" {
		return
	}
	nodes := htmlquery.Find(root, xpath)
	for _, node := range nodes {
		if node != nil && node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}
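
// removeNodesByIdentifier strips elements whose id equals the identifier or
// whose class list contains it as a whole word. For a (hypothetical)
// identifier "sidebar" the generated selectors are:
//
//	.//*[@id='sidebar']
//	.//*[contains(concat(' ', normalize-space(@class), ' '), ' sidebar ')]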
func removeNodesByIdentifier(root *exhtml.Node, identifier string) {
	identifier = strings.TrimSpace(identifier)
	if root == nil || identifier == "" {
		return
	}

	idLiteral := buildXPathLiteral(identifier)
	removeNodesByXPath(root, fmt.Sprintf(".//*[@id=%s]", idLiteral))

	classLiteral := buildXPathLiteral(" " + identifier + " ")
	removeNodesByXPath(root, fmt.Sprintf(".//*[contains(concat(' ', normalize-space(@class), ' '), %s)]", classLiteral))
}

func removeDefaultNonContentNodes(root *exhtml.Node) {
	for _, xp := range []string{
		".//script",
		".//style",
		".//noscript",
	} {
		removeNodesByXPath(root, xp)
	}
}

func normalizeWhitespace(value string) string {
	value = strings.ReplaceAll(value, "\u00a0", " ")
	return strings.TrimSpace(value)
}

// buildXPathLiteral quotes value as an XPath string literal, falling back to
// concat() when the value contains both single and double quotes.
func buildXPathLiteral(value string) string {
	if !strings.Contains(value, "'") {
		return "'" + value + "'"
	}
	if !strings.Contains(value, "\"") {
		return `"` + value + `"`
	}

	segments := strings.Split(value, "'")
	var builder strings.Builder
	builder.WriteString("concat(")
	for i, segment := range segments {
		if i > 0 {
			builder.WriteString(", \"'\", ")
		}
		if segment == "" {
			builder.WriteString("''")
			continue
		}
		builder.WriteString("'")
		builder.WriteString(segment)
		builder.WriteString("'")
	}
	builder.WriteString(")")
	return builder.String()
}

// Convert HTML content directly to markdown using domain-specific rules
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error) {
	content, err := p.Parse(htmlContent, domain, sourceURL)
	if err != nil {
		return "", err
	}
	return p.createMarkdown(content), nil
}

// GetSupportedDomains returns a list of domains that have parsing rules
func (p *ArticleParser) GetSupportedDomains() []string {
	var domains []string
	for domain := range p.rules {
		domains = append(domains, domain)
	}
	return domains
}

// SaveArticle saves the parsed content to filesystem and returns file paths
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return "", "", fmt.Errorf("failed to create storage directory: %w", err)
	}

	slug := p.slugify(content.Title)
	if slug == "" {
		slug = "article"
	}

	baseMarkdownPath := filepath.Join(dir, slug+".md")
	baseHTMLPath := filepath.Join(dir, slug+".html")

	markdownPath = baseMarkdownPath
	htmlPath = baseHTMLPath
	counter := 1
	for {
		if _, err := os.Stat(markdownPath); os.IsNotExist(err) {
			if _, err := os.Stat(htmlPath); os.IsNotExist(err) {
				break
			}
		}
		markdownPath = filepath.Join(dir, fmt.Sprintf("%s_%d.md", slug, counter))
		htmlPath = filepath.Join(dir, fmt.Sprintf("%s_%d.html", slug, counter))
		counter++
	}

	markdownContent := p.createMarkdown(content)
	if err := os.WriteFile(markdownPath, []byte(markdownContent), 0644); err != nil {
		return "", "", fmt.Errorf("failed to write markdown file: %w", err)
	}

	htmlContent := p.createHTML(content, markdownContent)
	if err := os.WriteFile(htmlPath, []byte(htmlContent), 0644); err != nil {
		os.Remove(markdownPath)
		return "", "", fmt.Errorf("failed to write HTML file: %w", err)
	}

	return markdownPath, htmlPath, nil
}

func (p *ArticleParser) slugify(title string) string {
	slug := strings.ToLower(title)
	reg := regexp.MustCompile(`[^a-z0-9]+`)
	slug = reg.ReplaceAllString(slug, "-")
	slug = strings.Trim(slug, "-")
	if len(slug) > 100 {
		slug = slug[:100]
		slug = strings.Trim(slug, "-")
	}
	return slug
}

func (p *ArticleParser) createMarkdown(content *ParsedContent) string {
	var builder strings.Builder

	builder.WriteString(fmt.Sprintf("# %s\n\n", content.Title))
	if content.Author != "" {
		builder.WriteString(fmt.Sprintf("**Author:** %s\n\n", content.Author))
	}
	if content.Date != "" {
		builder.WriteString(fmt.Sprintf("**Date:** %s\n\n", content.Date))
	}
	builder.WriteString(fmt.Sprintf("**Source:** %s\n\n", content.URL))
	builder.WriteString(fmt.Sprintf("**Saved:** %s\n\n", time.Now().Format("2006-01-02 15:04:05")))
	builder.WriteString("---\n\n")
	builder.WriteString(content.Content)

	return builder.String()
}

func (p *ArticleParser) createHTML(content *ParsedContent, markdownContent string) string {
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	mdParser := parser.NewWithExtensions(extensions)
	doc := mdParser.Parse([]byte(markdownContent))

	htmlFlags := html.CommonFlags | html.HrefTargetBlank
	opts := html.RendererOptions{Flags: htmlFlags}
	renderer := html.NewRenderer(opts)
	htmlBody := markdown.Render(doc, renderer)

	// Wrap the rendered body in a minimal HTML document shell.
	var builder strings.Builder
	builder.WriteString("<!DOCTYPE html>\n")
	builder.WriteString("<html>\n<head>\n")
	builder.WriteString(fmt.Sprintf("    <title>%s</title>\n", content.Title))
	builder.WriteString("    <meta charset=\"utf-8\">\n")
	builder.WriteString("    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n")
	builder.WriteString("    <style>body { max-width: 70ch; margin: 2rem auto; padding: 0 1rem; }</style>\n")
	builder.WriteString("</head>\n<body>\n")
	builder.Write(htmlBody)
	builder.WriteString("\n</body>\n</html>\n")
	return builder.String()
}

// CreateArticleFromURL is a convenience function that parses a URL and creates an instance of [models.Article]
func CreateArticleFromURL(url, dir string) (*models.Article, error) {
	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return nil, fmt.Errorf("failed to create parser: %w", err)
	}

	content, err := parser.ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("failed to parse URL: %w", err)
	}

	mdPath, htmlPath, err := parser.SaveArticle(content, dir)
	if err != nil {
		return nil, fmt.Errorf("failed to save article: %w", err)
	}

	return &models.Article{
		URL:          url,
		Title:        content.Title,
		Author:       content.Author,
		Date:         content.Date,
		MarkdownPath: mdPath,
		HTMLPath:     htmlPath,
		Created:      time.Now(),
		Modified:     time.Now(),
	}, nil
}
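
// Example (illustrative sketch from a calling package; the URL and target
// directory are placeholders):
//
//	article, err := articles.CreateArticleFromURL("https://example.com/post", "/tmp/noteleaf-articles")
//	if err != nil {
//		// handle error
//	}
//	fmt.Println(article.Title, article.MarkdownPath, article.HTMLPath)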