cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃

package articles

import (
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)

// ExtractionResult contains the results of heuristic content extraction.
type ExtractionResult struct {
	Content          string
	Title            string
	Author           string
	PublishedDate    string
	SiteName         string
	Language         string
	Confidence       float64
	ExtractionMethod string // e.g. "heuristic", "semantic-html", "xpath-preferred", "dual-validated"
}

// HeuristicExtractor implements Readability-style content extraction.
type HeuristicExtractor struct {
	scorer *Scorer
}

// NewHeuristicExtractor creates a new extractor with default scoring.
func NewHeuristicExtractor() *HeuristicExtractor {
	return &HeuristicExtractor{
		scorer: NewScorer(),
	}
}

// ExtractContent performs heuristic-based content extraction from an HTML document.
func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	if !e.scorer.IsProbablyReadable(doc) {
		return &ExtractionResult{
			Confidence:       0.1,
			ExtractionMethod: "heuristic",
		}
	}

	cleaned := e.cleanDocument(doc)
	candidates := e.scorer.FindTopCandidates(cleaned, 5)
	if len(candidates) == 0 {
		return &ExtractionResult{
			Confidence:       0.2,
			ExtractionMethod: "heuristic",
		}
	}

	topCandidate := candidates[0]
	content := e.extractTextContent(topCandidate.Node)
	result := &ExtractionResult{
		Content:          content,
		Confidence:       topCandidate.ConfidenceLevel,
		ExtractionMethod: "heuristic",
	}

	return result
}

// cleanDocument removes unwanted elements and prepares the document for extraction.
func (e *HeuristicExtractor) cleanDocument(doc *html.Node) *html.Node {
	cloned := e.cloneNode(doc)

	e.removeElements(cloned, "script", "style", "noscript", "iframe", "embed", "object")
	e.removeHiddenElements(cloned)
	e.removeUnlikelyCandidates(cloned)
	e.removeHighLinkDensityElements(cloned)

	return cloned
}

// cloneNode creates a deep copy of an HTML node tree.
func (e *HeuristicExtractor) cloneNode(node *html.Node) *html.Node {
	if node == nil {
		return nil
	}

	clone := &html.Node{
		Type:      node.Type,
		Data:      node.Data,
		DataAtom:  node.DataAtom,
		Namespace: node.Namespace,
		Attr:      make([]html.Attribute, len(node.Attr)),
	}

	copy(clone.Attr, node.Attr)

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		clonedChild := e.cloneNode(child)
		if clonedChild != nil {
			clone.AppendChild(clonedChild)
		}
	}

	return clone
}

// removeElements removes all elements with the specified tag names.
func (e *HeuristicExtractor) removeElements(root *html.Node, tagNames ...string) {
	if root == nil {
		return
	}

	tagMap := make(map[string]bool)
	for _, tag := range tagNames {
		tagMap[strings.ToLower(tag)] = true
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if tagMap[strings.ToLower(node.Data)] {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHiddenElements removes elements that are hidden via CSS or attributes.
func (e *HeuristicExtractor) removeHiddenElements(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			for _, attr := range node.Attr {
				if attr.Key == "hidden" {
					toRemove = append(toRemove, node)
					return
				}

				if attr.Key == "style" {
					style := strings.ToLower(attr.Val)
					if strings.Contains(style, "display:none") || strings.Contains(style, "display: none") ||
						strings.Contains(style, "visibility:hidden") || strings.Contains(style, "visibility: hidden") {
						toRemove = append(toRemove, node)
						return
					}
				}

				if attr.Key == "aria-hidden" && attr.Val == "true" {
					toRemove = append(toRemove, node)
					return
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeUnlikelyCandidates removes elements that are unlikely to be main content.
func (e *HeuristicExtractor) removeUnlikelyCandidates(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			score := e.scorer.getClassIdScore(node)

			if score < -40 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHighLinkDensityElements removes elements with excessive link density.
func (e *HeuristicExtractor) removeHighLinkDensityElements(root *html.Node) {
	if root == nil {
		return
	}

	const linkDensityThreshold = 0.75

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if strings.ToLower(node.Data) == "a" {
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					walk(child)
				}
				return
			}

			density := e.scorer.calculateLinkDensity(node)
			textLen := e.scorer.calculateTextLength(node)

			if density > linkDensityThreshold && textLen < 500 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// extractTextContent extracts cleaned text from a node.
func (e *HeuristicExtractor) extractTextContent(node *html.Node) string {
	if node == nil {
		return ""
	}

	var buf strings.Builder
	e.extractTextRecursive(node, &buf)

	text := buf.String()
	text = normalizeWhitespace(text)
	text = strings.TrimSpace(text)

	return text
}

// extractTextRecursive recursively extracts text with basic formatting.
func (e *HeuristicExtractor) extractTextRecursive(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}

	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
		return
	}

	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)

		if e.isBlockElement(tag) && buf.Len() > 0 {
			buf.WriteString("\n\n")
		}

		if tag == "li" {
			buf.WriteString("\n• ")
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			e.extractTextRecursive(child, buf)
		}

		if e.isBlockElement(tag) {
			buf.WriteString("\n")
		}
	}
}

// isBlockElement returns true for block-level HTML elements.
func (e *HeuristicExtractor) isBlockElement(tagName string) bool {
	blockElements := map[string]bool{
		"p":          true,
		"div":        true,
		"article":    true,
		"section":    true,
		"h1":         true,
		"h2":         true,
		"h3":         true,
		"h4":         true,
		"h5":         true,
		"h6":         true,
		"blockquote": true,
		"pre":        true,
		"ul":         true,
		"ol":         true,
		"table":      true,
		"tr":         true,
		"td":         true,
		"th":         true,
	}

	return blockElements[tagName]
}

// CompareWithXPath compares heuristic extraction with XPath-based extraction.
func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	heuristicResult := e.ExtractContent(doc)
	if heuristicResult == nil {
		heuristicResult = &ExtractionResult{
			ExtractionMethod: "heuristic",
			Confidence:       0.0,
		}
	}

	if xpathNode == nil {
		return heuristicResult
	}

	xpathContent := e.extractTextContent(xpathNode)
	xpathLen := len(xpathContent)
	heuristicLen := len(heuristicResult.Content)

	similarity := e.calculateSimilarity(xpathContent, heuristicResult.Content)

	if similarity > 0.8 {
		heuristicResult.Confidence = 0.95
		heuristicResult.ExtractionMethod = "dual-validated"
		return heuristicResult
	} else if float64(xpathLen) > float64(heuristicLen)*1.5 {
		return &ExtractionResult{
			Content:          xpathContent,
			Confidence:       0.85,
			ExtractionMethod: "xpath-preferred",
		}
	} else if float64(heuristicLen) > float64(xpathLen)*1.5 {
		heuristicResult.Confidence = 0.80
		heuristicResult.ExtractionMethod = "heuristic-preferred"
		return heuristicResult
	} else {
		heuristicResult.Confidence = 0.70
		heuristicResult.ExtractionMethod = "heuristic-fallback"
		return heuristicResult
	}
}

// calculateSimilarity estimates content similarity as the Jaccard index over
// unique lowercased words: shared words divided by total distinct words.
func (e *HeuristicExtractor) calculateSimilarity(text1, text2 string) float64 {
	if len(text1) == 0 || len(text2) == 0 {
		if len(text1) == 0 && len(text2) == 0 {
			return 1.0
		}
		return 0.0
	}

	words1 := strings.Fields(strings.ToLower(text1))
	words2 := strings.Fields(strings.ToLower(text2))

	if len(words1) == 0 || len(words2) == 0 {
		return 0.0
	}

	freq1 := make(map[string]int)
	freq2 := make(map[string]int)

	for _, word := range words1 {
		freq1[word]++
	}

	for _, word := range words2 {
		freq2[word]++
	}

	common := 0
	for word := range freq1 {
		if freq2[word] > 0 {
			common++
		}
	}

	union := len(freq1) + len(freq2) - common
	if union == 0 {
		return 0.0
	}

	return float64(common) / float64(union)
}

// ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first.
// Falls back to heuristic scoring if semantic elements aren't found.
func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	articleNode := htmlquery.FindOne(doc, "//article")
	if articleNode != nil {
		content := e.extractTextContent(articleNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.90,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	mainNode := htmlquery.FindOne(doc, "//main")
	if mainNode != nil {
		content := e.extractTextContent(mainNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.88,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	return e.ExtractContent(doc)
}
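
// Usage sketch (illustrative, not part of the original file): parse raw HTML
// with golang.org/x/net/html, then run the semantic-first extractor. The helper
// name extractFromHTML is hypothetical; the package's Scorer and
// normalizeWhitespace helpers are assumed to be defined elsewhere in articles.
func extractFromHTML(rawHTML string) (*ExtractionResult, error) {
	// html.Parse builds the *html.Node tree the extractor methods expect.
	doc, err := html.Parse(strings.NewReader(rawHTML))
	if err != nil {
		return nil, err
	}

	// Prefer <article>/<main> content, falling back to heuristic scoring.
	return NewHeuristicExtractor().ExtractWithSemanticHTML(doc), nil
}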