package articles import ( "strings" "github.com/antchfx/htmlquery" "golang.org/x/net/html" ) // ExtractionResult contains the results of heuristic content extraction. type ExtractionResult struct { Content string Title string Author string PublishedDate string SiteName string Language string Confidence float64 ExtractionMethod string // "heuristic" or "xpath" or "dual" } // HeuristicExtractor implements Readability-style content extraction. type HeuristicExtractor struct { scorer *Scorer } // NewHeuristicExtractor creates a new extractor with default scoring. func NewHeuristicExtractor() *HeuristicExtractor { return &HeuristicExtractor{ scorer: NewScorer(), } } // ExtractContent performs heuristic-based content extraction from an HTML document. func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult { if doc == nil { return nil } if !e.scorer.IsProbablyReadable(doc) { return &ExtractionResult{ Confidence: 0.1, ExtractionMethod: "heuristic", } } cleaned := e.cleanDocument(doc) candidates := e.scorer.FindTopCandidates(cleaned, 5) if len(candidates) == 0 { return &ExtractionResult{ Confidence: 0.2, ExtractionMethod: "heuristic", } } topCandidate := candidates[0] content := e.extractTextContent(topCandidate.Node) result := &ExtractionResult{ Content: content, Confidence: topCandidate.ConfidenceLevel, ExtractionMethod: "heuristic", } return result } // cleanDocument removes unwanted elements and prepares the document for extraction. func (e *HeuristicExtractor) cleanDocument(doc *html.Node) *html.Node { cloned := e.cloneNode(doc) e.removeElements(cloned, "script", "style", "noscript", "iframe", "embed", "object") e.removeHiddenElements(cloned) e.removeUnlikelyCandidates(cloned) e.removeHighLinkDensityElements(cloned) return cloned } // cloneNode creates a deep copy of an HTML node tree. func (e *HeuristicExtractor) cloneNode(node *html.Node) *html.Node { if node == nil { return nil } clone := &html.Node{ Type: node.Type, Data: node.Data, DataAtom: node.DataAtom, Namespace: node.Namespace, Attr: make([]html.Attribute, len(node.Attr)), } copy(clone.Attr, node.Attr) for child := node.FirstChild; child != nil; child = child.NextSibling { clonedChild := e.cloneNode(child) if clonedChild != nil { clone.AppendChild(clonedChild) } } return clone } // removeElements removes all elements with the specified tag names. func (e *HeuristicExtractor) removeElements(root *html.Node, tagNames ...string) { if root == nil { return } tagMap := make(map[string]bool) for _, tag := range tagNames { tagMap[strings.ToLower(tag)] = true } var toRemove []*html.Node var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode { if tagMap[strings.ToLower(node.Data)] { toRemove = append(toRemove, node) return } } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(root) for _, node := range toRemove { if node.Parent != nil { node.Parent.RemoveChild(node) } } } // removeHiddenElements removes elements that are hidden via CSS or attributes. func (e *HeuristicExtractor) removeHiddenElements(root *html.Node) { if root == nil { return } var toRemove []*html.Node var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode { for _, attr := range node.Attr { if attr.Key == "hidden" { toRemove = append(toRemove, node) return } if attr.Key == "style" { style := strings.ToLower(attr.Val) if strings.Contains(style, "display:none") || strings.Contains(style, "display: none") || strings.Contains(style, "visibility:hidden") || strings.Contains(style, "visibility: hidden") { toRemove = append(toRemove, node) return } } if attr.Key == "aria-hidden" && attr.Val == "true" { toRemove = append(toRemove, node) return } } } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(root) for _, node := range toRemove { if node.Parent != nil { node.Parent.RemoveChild(node) } } } // removeUnlikelyCandidates removes elements that are unlikely to be main content. func (e *HeuristicExtractor) removeUnlikelyCandidates(root *html.Node) { if root == nil { return } var toRemove []*html.Node var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode { score := e.scorer.getClassIdScore(node) if score < -40 { toRemove = append(toRemove, node) return } } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(root) for _, node := range toRemove { if node.Parent != nil { node.Parent.RemoveChild(node) } } } // removeHighLinkDensityElements removes elements with excessive link density. func (e *HeuristicExtractor) removeHighLinkDensityElements(root *html.Node) { if root == nil { return } const linkDensityThreshold = 0.75 var toRemove []*html.Node var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode { if strings.ToLower(node.Data) == "a" { for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } return } density := e.scorer.calculateLinkDensity(node) textLen := e.scorer.calculateTextLength(node) if density > linkDensityThreshold && textLen < 500 { toRemove = append(toRemove, node) return } } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(root) for _, node := range toRemove { if node.Parent != nil { node.Parent.RemoveChild(node) } } } // extractTextContent extracts cleaned text from a node. func (e *HeuristicExtractor) extractTextContent(node *html.Node) string { if node == nil { return "" } var buf strings.Builder e.extractTextRecursive(node, &buf) text := buf.String() text = normalizeWhitespace(text) text = strings.TrimSpace(text) return text } // extractTextRecursive recursively extracts text with basic formatting. func (e *HeuristicExtractor) extractTextRecursive(node *html.Node, buf *strings.Builder) { if node == nil { return } if node.Type == html.TextNode { buf.WriteString(node.Data) return } if node.Type == html.ElementNode { tag := strings.ToLower(node.Data) if e.isBlockElement(tag) && buf.Len() > 0 { buf.WriteString("\n\n") } if tag == "li" { buf.WriteString("\n• ") } for child := node.FirstChild; child != nil; child = child.NextSibling { e.extractTextRecursive(child, buf) } if e.isBlockElement(tag) { buf.WriteString("\n") } } } // isBlockElement returns true for block-level HTML elements. func (e *HeuristicExtractor) isBlockElement(tagName string) bool { blockElements := map[string]bool{ "p": true, "div": true, "article": true, "section": true, "h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true, "blockquote": true, "pre": true, "ul": true, "ol": true, "table": true, "tr": true, "td": true, "th": true, } return blockElements[tagName] } // CompareWithXPath compares heuristic extraction with XPath-based extraction. func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult { if doc == nil { return nil } heuristicResult := e.ExtractContent(doc) if heuristicResult == nil { heuristicResult = &ExtractionResult{ ExtractionMethod: "heuristic", Confidence: 0.0, } } if xpathNode == nil { return heuristicResult } xpathContent := e.extractTextContent(xpathNode) xpathLen := len(xpathContent) heuristicLen := len(heuristicResult.Content) similarity := e.calculateSimilarity(xpathContent, heuristicResult.Content) if similarity > 0.8 { heuristicResult.Confidence = 0.95 heuristicResult.ExtractionMethod = "dual-validated" return heuristicResult } else if float64(xpathLen) > float64(heuristicLen)*1.5 { return &ExtractionResult{ Content: xpathContent, Confidence: 0.85, ExtractionMethod: "xpath-preferred", } } else if float64(heuristicLen) > float64(xpathLen)*1.5 { heuristicResult.Confidence = 0.80 heuristicResult.ExtractionMethod = "heuristic-preferred" return heuristicResult } else { heuristicResult.Confidence = 0.70 heuristicResult.ExtractionMethod = "heuristic-fallback" return heuristicResult } } // calculateSimilarity estimates content similarity (simple ratio of common words). func (e *HeuristicExtractor) calculateSimilarity(text1, text2 string) float64 { if len(text1) == 0 || len(text2) == 0 { if len(text1) == 0 && len(text2) == 0 { return 1.0 } return 0.0 } words1 := strings.Fields(strings.ToLower(text1)) words2 := strings.Fields(strings.ToLower(text2)) if len(words1) == 0 || len(words2) == 0 { return 0.0 } freq1 := make(map[string]int) freq2 := make(map[string]int) for _, word := range words1 { freq1[word]++ } for _, word := range words2 { freq2[word]++ } common := 0 for word := range freq1 { if freq2[word] > 0 { common++ } } union := len(freq1) + len(freq2) - common if union == 0 { return 0.0 } return float64(common) / float64(union) } // ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first. // Falls back to heuristic scoring if semantic elements aren't found. func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult { if doc == nil { return nil } articleNode := htmlquery.FindOne(doc, "//article") if articleNode != nil { content := e.extractTextContent(articleNode) if len(content) > e.scorer.minContentLength { return &ExtractionResult{ Content: content, Confidence: 0.90, ExtractionMethod: "semantic-html", } } } mainNode := htmlquery.FindOne(doc, "//main") if mainNode != nil { content := e.extractTextContent(mainNode) if len(content) > e.scorer.minContentLength { return &ExtractionResult{ Content: content, Confidence: 0.88, ExtractionMethod: "semantic-html", } } } return e.ExtractContent(doc) }