cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃

package articles

import (
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)

// ExtractionResult contains the results of heuristic content extraction.
type ExtractionResult struct {
	Content          string
	Title            string
	Author           string
	PublishedDate    string
	SiteName         string
	Language         string
	Confidence       float64
	ExtractionMethod string // e.g. "heuristic", "semantic-html", "xpath-preferred", "dual-validated"
}

// HeuristicExtractor implements Readability-style content extraction.
type HeuristicExtractor struct {
	scorer *Scorer
}

// NewHeuristicExtractor creates a new extractor with default scoring.
func NewHeuristicExtractor() *HeuristicExtractor {
	return &HeuristicExtractor{
		scorer: NewScorer(),
	}
}

// ExtractContent performs heuristic-based content extraction from an HTML document.
func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	if !e.scorer.IsProbablyReadable(doc) {
		return &ExtractionResult{
			Confidence:       0.1,
			ExtractionMethod: "heuristic",
		}
	}

	cleaned := e.cleanDocument(doc)
	candidates := e.scorer.FindTopCandidates(cleaned, 5)
	if len(candidates) == 0 {
		return &ExtractionResult{
			Confidence:       0.2,
			ExtractionMethod: "heuristic",
		}
	}

	topCandidate := candidates[0]
	content := e.extractTextContent(topCandidate.Node)
	result := &ExtractionResult{
		Content:          content,
		Confidence:       topCandidate.ConfidenceLevel,
		ExtractionMethod: "heuristic",
	}

	return result
}

// cleanDocument removes unwanted elements and prepares the document for extraction.
func (e *HeuristicExtractor) cleanDocument(doc *html.Node) *html.Node {
	cloned := e.cloneNode(doc)

	e.removeElements(cloned, "script", "style", "noscript", "iframe", "embed", "object")
	e.removeHiddenElements(cloned)
	e.removeUnlikelyCandidates(cloned)
	e.removeHighLinkDensityElements(cloned)

	return cloned
}

// cloneNode creates a deep copy of an HTML node tree.
func (e *HeuristicExtractor) cloneNode(node *html.Node) *html.Node {
	if node == nil {
		return nil
	}

	clone := &html.Node{
		Type:      node.Type,
		Data:      node.Data,
		DataAtom:  node.DataAtom,
		Namespace: node.Namespace,
		Attr:      make([]html.Attribute, len(node.Attr)),
	}

	copy(clone.Attr, node.Attr)

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		clonedChild := e.cloneNode(child)
		if clonedChild != nil {
			clone.AppendChild(clonedChild)
		}
	}

	return clone
}

// removeElements removes all elements with the specified tag names.
func (e *HeuristicExtractor) removeElements(root *html.Node, tagNames ...string) {
	if root == nil {
		return
	}

	tagMap := make(map[string]bool)
	for _, tag := range tagNames {
		tagMap[strings.ToLower(tag)] = true
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if tagMap[strings.ToLower(node.Data)] {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHiddenElements removes elements that are hidden via CSS or attributes.
func (e *HeuristicExtractor) removeHiddenElements(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			for _, attr := range node.Attr {
				if attr.Key == "hidden" {
					toRemove = append(toRemove, node)
					return
				}

				if attr.Key == "style" {
					style := strings.ToLower(attr.Val)
					if strings.Contains(style, "display:none") || strings.Contains(style, "display: none") ||
						strings.Contains(style, "visibility:hidden") || strings.Contains(style, "visibility: hidden") {
						toRemove = append(toRemove, node)
						return
					}
				}

				if attr.Key == "aria-hidden" && attr.Val == "true" {
					toRemove = append(toRemove, node)
					return
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeUnlikelyCandidates removes elements that are unlikely to be main content.
func (e *HeuristicExtractor) removeUnlikelyCandidates(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			score := e.scorer.getClassIdScore(node)

			if score < -40 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHighLinkDensityElements removes elements with excessive link density.
func (e *HeuristicExtractor) removeHighLinkDensityElements(root *html.Node) {
	if root == nil {
		return
	}

	const linkDensityThreshold = 0.75

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if strings.ToLower(node.Data) == "a" {
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					walk(child)
				}
				return
			}

			density := e.scorer.calculateLinkDensity(node)
			textLen := e.scorer.calculateTextLength(node)

			if density > linkDensityThreshold && textLen < 500 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// extractTextContent extracts cleaned text from a node.
func (e *HeuristicExtractor) extractTextContent(node *html.Node) string {
	if node == nil {
		return ""
	}

	var buf strings.Builder
	e.extractTextRecursive(node, &buf)

	text := buf.String()
	text = normalizeWhitespace(text)
	text = strings.TrimSpace(text)

	return text
}

// extractTextRecursive recursively extracts text with basic formatting.
func (e *HeuristicExtractor) extractTextRecursive(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}

	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
		return
	}

	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)

		if e.isBlockElement(tag) && buf.Len() > 0 {
			buf.WriteString("\n\n")
		}

		if tag == "li" {
			buf.WriteString("\n• ")
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			e.extractTextRecursive(child, buf)
		}

		if e.isBlockElement(tag) {
			buf.WriteString("\n")
		}
	}
}

// isBlockElement returns true for block-level HTML elements.
func (e *HeuristicExtractor) isBlockElement(tagName string) bool {
	blockElements := map[string]bool{
		"p":          true,
		"div":        true,
		"article":    true,
		"section":    true,
		"h1":         true,
		"h2":         true,
		"h3":         true,
		"h4":         true,
		"h5":         true,
		"h6":         true,
		"blockquote": true,
		"pre":        true,
		"ul":         true,
		"ol":         true,
		"table":      true,
		"tr":         true,
		"td":         true,
		"th":         true,
	}

	return blockElements[tagName]
}

// CompareWithXPath compares heuristic extraction with XPath-based extraction.
func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	heuristicResult := e.ExtractContent(doc)
	if heuristicResult == nil {
		heuristicResult = &ExtractionResult{
			ExtractionMethod: "heuristic",
			Confidence:       0.0,
		}
	}

	if xpathNode == nil {
		return heuristicResult
	}

	xpathContent := e.extractTextContent(xpathNode)
	xpathLen := len(xpathContent)
	heuristicLen := len(heuristicResult.Content)

	similarity := e.calculateSimilarity(xpathContent, heuristicResult.Content)

	if similarity > 0.8 {
		heuristicResult.Confidence = 0.95
		heuristicResult.ExtractionMethod = "dual-validated"
		return heuristicResult
	} else if float64(xpathLen) > float64(heuristicLen)*1.5 {
		return &ExtractionResult{
			Content:          xpathContent,
			Confidence:       0.85,
			ExtractionMethod: "xpath-preferred",
		}
	} else if float64(heuristicLen) > float64(xpathLen)*1.5 {
		heuristicResult.Confidence = 0.80
		heuristicResult.ExtractionMethod = "heuristic-preferred"
		return heuristicResult
	} else {
		heuristicResult.Confidence = 0.70
		heuristicResult.ExtractionMethod = "heuristic-fallback"
		return heuristicResult
	}
}

// calculateSimilarity estimates content similarity as the Jaccard index over
// unique lowercased words: shared words divided by total distinct words.
func (e *HeuristicExtractor) calculateSimilarity(text1, text2 string) float64 {
	if len(text1) == 0 || len(text2) == 0 {
		if len(text1) == 0 && len(text2) == 0 {
			return 1.0
		}
		return 0.0
	}

	words1 := strings.Fields(strings.ToLower(text1))
	words2 := strings.Fields(strings.ToLower(text2))

	if len(words1) == 0 || len(words2) == 0 {
		return 0.0
	}

	freq1 := make(map[string]int)
	freq2 := make(map[string]int)

	for _, word := range words1 {
		freq1[word]++
	}

	for _, word := range words2 {
		freq2[word]++
	}

	common := 0
	for word := range freq1 {
		if freq2[word] > 0 {
			common++
		}
	}

	union := len(freq1) + len(freq2) - common
	if union == 0 {
		return 0.0
	}

	return float64(common) / float64(union)
}

// ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first.
// Falls back to heuristic scoring if semantic elements aren't found.
func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	articleNode := htmlquery.FindOne(doc, "//article")
	if articleNode != nil {
		content := e.extractTextContent(articleNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.90,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	mainNode := htmlquery.FindOne(doc, "//main")
	if mainNode != nil {
		content := e.extractTextContent(mainNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.88,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	return e.ExtractContent(doc)
}
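
// Usage sketch (illustrative, not part of the original file): parse raw HTML
// with golang.org/x/net/html, then run the semantic-first extractor. The helper
// name extractFromHTML is hypothetical; the package's Scorer and
// normalizeWhitespace helpers are assumed to be defined elsewhere in articles.
func extractFromHTML(rawHTML string) (*ExtractionResult, error) {
	// html.Parse builds the *html.Node tree the extractor methods expect.
	doc, err := html.Parse(strings.NewReader(rawHTML))
	if err != nil {
		return nil, err
	}

	// Prefer <article>/<main> content, falling back to heuristic scoring.
	return NewHeuristicExtractor().ExtractWithSemanticHTML(doc), nil
}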