cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
package articles

import (
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)

// ExtractionResult contains the results of heuristic content extraction.
type ExtractionResult struct {
	Content          string
	Title            string
	Author           string
	PublishedDate    string
	SiteName         string
	Language         string
	Confidence       float64
	ExtractionMethod string // "heuristic" or "xpath" or "dual"
}

// HeuristicExtractor implements Readability-style content extraction.
type HeuristicExtractor struct {
	scorer *Scorer
}

// NewHeuristicExtractor creates a new extractor with default scoring.
func NewHeuristicExtractor() *HeuristicExtractor {
	return &HeuristicExtractor{
		scorer: NewScorer(),
	}
}

// ExtractContent performs heuristic-based content extraction from an HTML document.
func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	if !e.scorer.IsProbablyReadable(doc) {
		return &ExtractionResult{
			Confidence:       0.1,
			ExtractionMethod: "heuristic",
		}
	}

	cleaned := e.cleanDocument(doc)
	candidates := e.scorer.FindTopCandidates(cleaned, 5)
	if len(candidates) == 0 {
		return &ExtractionResult{
			Confidence:       0.2,
			ExtractionMethod: "heuristic",
		}
	}

	topCandidate := candidates[0]
	content := e.extractTextContent(topCandidate.Node)
	result := &ExtractionResult{
		Content:          content,
		Confidence:       topCandidate.ConfidenceLevel,
		ExtractionMethod: "heuristic",
	}

	return result
}

// cleanDocument removes unwanted elements and prepares the document for extraction.
func (e *HeuristicExtractor) cleanDocument(doc *html.Node) *html.Node {
	cloned := e.cloneNode(doc)

	e.removeElements(cloned, "script", "style", "noscript", "iframe", "embed", "object")
	e.removeHiddenElements(cloned)
	e.removeUnlikelyCandidates(cloned)
	e.removeHighLinkDensityElements(cloned)

	return cloned
}

// cloneNode creates a deep copy of an HTML node tree.
func (e *HeuristicExtractor) cloneNode(node *html.Node) *html.Node {
	if node == nil {
		return nil
	}

	clone := &html.Node{
		Type:      node.Type,
		Data:      node.Data,
		DataAtom:  node.DataAtom,
		Namespace: node.Namespace,
		Attr:      make([]html.Attribute, len(node.Attr)),
	}

	copy(clone.Attr, node.Attr)

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		clonedChild := e.cloneNode(child)
		if clonedChild != nil {
			clone.AppendChild(clonedChild)
		}
	}

	return clone
}

// removeElements removes all elements with the specified tag names.
func (e *HeuristicExtractor) removeElements(root *html.Node, tagNames ...string) {
	if root == nil {
		return
	}

	tagMap := make(map[string]bool)
	for _, tag := range tagNames {
		tagMap[strings.ToLower(tag)] = true
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if tagMap[strings.ToLower(node.Data)] {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHiddenElements removes elements that are hidden via CSS or attributes.
func (e *HeuristicExtractor) removeHiddenElements(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			for _, attr := range node.Attr {
				if attr.Key == "hidden" {
					toRemove = append(toRemove, node)
					return
				}

				if attr.Key == "style" {
					style := strings.ToLower(attr.Val)
					if strings.Contains(style, "display:none") || strings.Contains(style, "display: none") ||
						strings.Contains(style, "visibility:hidden") || strings.Contains(style, "visibility: hidden") {
						toRemove = append(toRemove, node)
						return
					}
				}

				if attr.Key == "aria-hidden" && attr.Val == "true" {
					toRemove = append(toRemove, node)
					return
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeUnlikelyCandidates removes elements that are unlikely to be main content.
func (e *HeuristicExtractor) removeUnlikelyCandidates(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			score := e.scorer.getClassIdScore(node)

			if score < -40 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHighLinkDensityElements removes elements with excessive link density.
func (e *HeuristicExtractor) removeHighLinkDensityElements(root *html.Node) {
	if root == nil {
		return
	}

	const linkDensityThreshold = 0.75

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if strings.ToLower(node.Data) == "a" {
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					walk(child)
				}
				return
			}

			density := e.scorer.calculateLinkDensity(node)
			textLen := e.scorer.calculateTextLength(node)

			if density > linkDensityThreshold && textLen < 500 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// extractTextContent extracts cleaned text from a node.
func (e *HeuristicExtractor) extractTextContent(node *html.Node) string {
	if node == nil {
		return ""
	}

	var buf strings.Builder
	e.extractTextRecursive(node, &buf)

	text := buf.String()
	text = normalizeWhitespace(text)
	text = strings.TrimSpace(text)

	return text
}

// extractTextRecursive recursively extracts text with basic formatting.
func (e *HeuristicExtractor) extractTextRecursive(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}

	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
		return
	}

	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)

		if e.isBlockElement(tag) && buf.Len() > 0 {
			buf.WriteString("\n\n")
		}

		if tag == "li" {
			buf.WriteString("\n• ")
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			e.extractTextRecursive(child, buf)
		}

		if e.isBlockElement(tag) {
			buf.WriteString("\n")
		}
	}
}

// isBlockElement returns true for block-level HTML elements.
func (e *HeuristicExtractor) isBlockElement(tagName string) bool {
	blockElements := map[string]bool{
		"p":          true,
		"div":        true,
		"article":    true,
		"section":    true,
		"h1":         true,
		"h2":         true,
		"h3":         true,
		"h4":         true,
		"h5":         true,
		"h6":         true,
		"blockquote": true,
		"pre":        true,
		"ul":         true,
		"ol":         true,
		"table":      true,
		"tr":         true,
		"td":         true,
		"th":         true,
	}

	return blockElements[tagName]
}

// CompareWithXPath compares heuristic extraction with XPath-based extraction.
func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	heuristicResult := e.ExtractContent(doc)
	if heuristicResult == nil {
		heuristicResult = &ExtractionResult{
			ExtractionMethod: "heuristic",
			Confidence:       0.0,
		}
	}

	if xpathNode == nil {
		return heuristicResult
	}

	xpathContent := e.extractTextContent(xpathNode)
	xpathLen := len(xpathContent)
	heuristicLen := len(heuristicResult.Content)

	similarity := e.calculateSimilarity(xpathContent, heuristicResult.Content)

	if similarity > 0.8 {
		heuristicResult.Confidence = 0.95
		heuristicResult.ExtractionMethod = "dual-validated"
		return heuristicResult
	} else if float64(xpathLen) > float64(heuristicLen)*1.5 {
		return &ExtractionResult{
			Content:          xpathContent,
			Confidence:       0.85,
			ExtractionMethod: "xpath-preferred",
		}
	} else if float64(heuristicLen) > float64(xpathLen)*1.5 {
		heuristicResult.Confidence = 0.80
		heuristicResult.ExtractionMethod = "heuristic-preferred"
		return heuristicResult
	} else {
		heuristicResult.Confidence = 0.70
		heuristicResult.ExtractionMethod = "heuristic-fallback"
		return heuristicResult
	}
}

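// exampleDualExtraction is a hypothetical usage sketch, not part of the
// original file: it shows how CompareWithXPath might be driven with a node
// selected via a site-specific XPath. The selector below is illustrative
// only, not a value taken from this repo.
func exampleDualExtraction(doc *html.Node) *ExtractionResult {
	e := NewHeuristicExtractor()

	// Assume the target site wraps its article body in a container like this.
	xpathNode := htmlquery.FindOne(doc, "//div[contains(@class, 'article-body')]")

	// CompareWithXPath cross-checks the heuristic result against the XPath
	// node and returns whichever looks more complete, with an adjusted
	// Confidence and ExtractionMethod.
	return e.CompareWithXPath(doc, xpathNode)
}
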
// calculateSimilarity estimates content similarity (simple ratio of common words).
func (e *HeuristicExtractor) calculateSimilarity(text1, text2 string) float64 {
	if len(text1) == 0 || len(text2) == 0 {
		if len(text1) == 0 && len(text2) == 0 {
			return 1.0
		}
		return 0.0
	}

	words1 := strings.Fields(strings.ToLower(text1))
	words2 := strings.Fields(strings.ToLower(text2))

	if len(words1) == 0 || len(words2) == 0 {
		return 0.0
	}

	freq1 := make(map[string]int)
	freq2 := make(map[string]int)

	for _, word := range words1 {
		freq1[word]++
	}

	for _, word := range words2 {
		freq2[word]++
	}

	common := 0
	for word := range freq1 {
		if freq2[word] > 0 {
			common++
		}
	}

	union := len(freq1) + len(freq2) - common
	if union == 0 {
		return 0.0
	}

	return float64(common) / float64(union)
}

// ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first.
// Falls back to heuristic scoring if semantic elements aren't found.
func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	articleNode := htmlquery.FindOne(doc, "//article")
	if articleNode != nil {
		content := e.extractTextContent(articleNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.90,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	mainNode := htmlquery.FindOne(doc, "//main")
	if mainNode != nil {
		content := e.extractTextContent(mainNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.88,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	return e.ExtractContent(doc)
}

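// exampleExtract is a hypothetical usage sketch, not part of the original
// file: it wires the extractor up end to end. rawHTML is assumed to hold a
// fetched article page.
func exampleExtract(rawHTML string) (*ExtractionResult, error) {
	doc, err := html.Parse(strings.NewReader(rawHTML))
	if err != nil {
		return nil, err
	}

	e := NewHeuristicExtractor()

	// Try semantic <article>/<main> extraction first; ExtractWithSemanticHTML
	// falls back to the Readability-style scoring path on its own.
	return e.ExtractWithSemanticHTML(doc), nil
}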