cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 389 lines 10 kB view raw
package articles

import (
	"math"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// ContentScore represents the score and metadata for a content node.
type ContentScore struct {
	Node            *html.Node // element this score describes
	Score           float64    // accumulated heuristic score
	TextLength      int        // trimmed length of inner text, in bytes
	LinkDensity     float64    // link text / total text ratio; 0 when there is no text
	ParagraphCount  int        // number of <p> elements within the node (including itself)
	AncestorDepth   int        // number of parents between the node and the root
	ConfidenceLevel float64    // heuristic confidence in [0, 1]
}

// Scorer implements Readability-style heuristic scoring for content extraction.
type Scorer struct {
	linkDensityWeight   float64 // multiplier applied to link density (negative: links hurt)
	classWeightPositive float64 // bonus when class/id matches positivePattern
	classWeightNegative float64 // penalty when class/id matches negativePattern
	paragraphWeight     float64 // per-paragraph score contribution
	ancestorDecayFactor float64 // per-level decay when propagating scores to ancestors
	minContentLength    int     // minimum text length for a candidate
	minScore            float64 // minimum score for a candidate
	positivePattern     *regexp.Regexp // class/id substrings that suggest main content
	negativePattern     *regexp.Regexp // class/id substrings that suggest chrome, nav, or ads
	unlikelyPattern     *regexp.Regexp // class/id substrings that disqualify a node outright
}

// NewScorer creates a new Scorer with default Readability.js-inspired weights.
func NewScorer() *Scorer {
	return &Scorer{
		linkDensityWeight:   -1.0,
		classWeightPositive: 25.0,
		classWeightNegative: -25.0,
		paragraphWeight:     1.0,
		ancestorDecayFactor: 0.5,
		minContentLength:    140,
		minScore:            20.0,

		positivePattern: regexp.MustCompile(`(?i)(article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story)`),
		negativePattern: regexp.MustCompile(`(?i)(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|ad-|advertisement|breadcrumb|hidden|nav|menu|header)`),
		unlikelyPattern: regexp.MustCompile(`(?i)(banner|cookie|popup|modal)`),
	}
}

// ScoreNode calculates a content score for the given node based on multiple heuristics.
// This implements the core Readability scoring algorithm.
55func (s *Scorer) ScoreNode(node *html.Node) *ContentScore { 56 if node == nil || node.Type != html.ElementNode { 57 return nil 58 } 59 60 score := &ContentScore{ 61 Node: node, 62 Score: 0.0, 63 AncestorDepth: s.calculateDepth(node), 64 } 65 66 score.Score = s.getTagScore(node.Data) 67 score.Score += s.getClassIdScore(node) 68 69 score.TextLength = s.calculateTextLength(node) 70 score.LinkDensity = s.calculateLinkDensity(node) 71 score.ParagraphCount = s.countParagraphs(node) 72 73 score.Score += score.LinkDensity * s.linkDensityWeight 74 score.Score += float64(score.ParagraphCount) * s.paragraphWeight 75 score.Score += s.getTextLengthScore(score.TextLength) 76 77 score.ConfidenceLevel = s.calculateConfidence(score) 78 return score 79} 80 81// getTagScore returns a base score based on the HTML tag type. 82// Some tags are more likely to contain main content than others. 83func (s *Scorer) getTagScore(tagName string) float64 { 84 switch strings.ToLower(tagName) { 85 case "article": 86 return 30.0 87 case "section": 88 return 15.0 89 case "div": 90 return 5.0 91 case "main": 92 return 40.0 93 case "p": 94 return 3.0 95 case "pre", "td", "blockquote": 96 return 3.0 97 case "address", "ol", "ul", "dl", "dd", "dt", "li", "form": 98 return -3.0 99 case "h1", "h2", "h3", "h4", "h5", "h6", "th": 100 return -5.0 101 default: 102 return 0.0 103 } 104} 105 106// getClassIdScore analyzes class and ID attributes for positive/negative indicators. 107// Returns a positive score for content-like names, negative for navigation/ads. 
108func (s *Scorer) getClassIdScore(node *html.Node) float64 { 109 score := 0.0 110 classID := s.getClassAndID(node) 111 112 if classID == "" { 113 return 0.0 114 } 115 116 if s.unlikelyPattern.MatchString(classID) { 117 return -50.0 118 } 119 120 if s.negativePattern.MatchString(classID) { 121 score += s.classWeightNegative 122 } 123 124 if s.positivePattern.MatchString(classID) { 125 score += s.classWeightPositive 126 } 127 128 return score 129} 130 131// getClassAndID concatenates class and ID attributes for pattern matching. 132func (s *Scorer) getClassAndID(node *html.Node) string { 133 var parts []string 134 135 for _, attr := range node.Attr { 136 if attr.Key == "class" || attr.Key == "id" { 137 parts = append(parts, attr.Val) 138 } 139 } 140 141 return strings.Join(parts, " ") 142} 143 144// calculateTextLength returns the total text length within the node. 145func (s *Scorer) calculateTextLength(node *html.Node) int { 146 text := s.getInnerText(node) 147 return len(strings.TrimSpace(text)) 148} 149 150// calculateLinkDensity calculates the ratio of link text to total text. 151// Higher link density indicates navigation or related links, not main content. 152func (s *Scorer) calculateLinkDensity(node *html.Node) float64 { 153 totalText := s.getInnerText(node) 154 linkText := s.getLinkText(node) 155 156 totalLen := len(strings.TrimSpace(totalText)) 157 linkLen := len(strings.TrimSpace(linkText)) 158 159 if totalLen == 0 { 160 return 0.0 161 } 162 163 return float64(linkLen) / float64(totalLen) 164} 165 166// getInnerText extracts all text content from a node and its descendants. 167func (s *Scorer) getInnerText(node *html.Node) string { 168 var buf strings.Builder 169 s.extractText(node, &buf) 170 return buf.String() 171} 172 173// extractText recursively extracts text from a node tree. 
174func (s *Scorer) extractText(node *html.Node, buf *strings.Builder) { 175 if node == nil { 176 return 177 } 178 179 if node.Type == html.TextNode { 180 buf.WriteString(node.Data) 181 buf.WriteString(" ") 182 return 183 } 184 185 if node.Type == html.ElementNode { 186 tag := strings.ToLower(node.Data) 187 if tag == "script" || tag == "style" || tag == "noscript" { 188 return 189 } 190 } 191 192 for child := node.FirstChild; child != nil; child = child.NextSibling { 193 s.extractText(child, buf) 194 } 195} 196 197// getLinkText extracts text from anchor tags only. 198func (s *Scorer) getLinkText(node *html.Node) string { 199 var buf strings.Builder 200 s.extractLinkText(node, &buf) 201 return buf.String() 202} 203 204// extractLinkText recursively extracts text from anchor tags. 205func (s *Scorer) extractLinkText(node *html.Node, buf *strings.Builder) { 206 if node == nil { 207 return 208 } 209 210 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "a" { 211 s.extractText(node, buf) 212 return 213 } 214 215 for child := node.FirstChild; child != nil; child = child.NextSibling { 216 s.extractLinkText(child, buf) 217 } 218} 219 220// countParagraphs counts paragraph elements within the node. 221func (s *Scorer) countParagraphs(node *html.Node) int { 222 count := 0 223 s.walkParagraphs(node, &count) 224 return count 225} 226 227// walkParagraphs recursively counts paragraph elements. 228func (s *Scorer) walkParagraphs(node *html.Node, count *int) { 229 if node == nil { 230 return 231 } 232 233 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "p" { 234 *count++ 235 } 236 237 for child := node.FirstChild; child != nil; child = child.NextSibling { 238 s.walkParagraphs(child, count) 239 } 240} 241 242// getTextLengthScore provides a bonus for nodes with substantial text content. 
243func (s *Scorer) getTextLengthScore(textLen int) float64 { 244 if textLen < 25 { 245 return 0.0 246 } 247 return math.Log10(float64(textLen)) * 2.0 248} 249 250// calculateDepth calculates how deep in the DOM tree this node is. 251func (s *Scorer) calculateDepth(node *html.Node) int { 252 depth := 0 253 for n := node.Parent; n != nil; n = n.Parent { 254 depth++ 255 } 256 return depth 257} 258 259// ScoreAncestors propagates scores up the DOM tree with decay. 260// This implements the Readability algorithm's ancestor scoring. 261func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64) { 262 if node == nil || baseScore <= 0 { 263 return 264 } 265 266 currentScore := baseScore 267 level := 0 268 269 for parent := node.Parent; parent != nil && level < 5; parent = parent.Parent { 270 if parent.Type != html.ElementNode { 271 continue 272 } 273 274 if _, exists := scores[parent]; !exists { 275 scores[parent] = s.ScoreNode(parent) 276 if scores[parent] == nil { 277 continue 278 } 279 } 280 281 decayedScore := currentScore * math.Pow(s.ancestorDecayFactor, float64(level+1)) 282 scores[parent].Score += decayedScore 283 level++ 284 } 285} 286 287// FindTopCandidates identifies the N highest-scoring content candidates. 
288func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore { 289 if root == nil || n <= 0 { 290 return nil 291 } 292 293 scores := make(map[*html.Node]*ContentScore) 294 s.scoreTree(root, scores) 295 296 var candidates []*ContentScore 297 for _, score := range scores { 298 if score.Score >= s.minScore && score.TextLength >= s.minContentLength { 299 candidates = append(candidates, score) 300 } 301 } 302 303 for i := 0; i < len(candidates); i++ { 304 for j := i + 1; j < len(candidates); j++ { 305 if candidates[j].Score > candidates[i].Score { 306 candidates[i], candidates[j] = candidates[j], candidates[i] 307 } 308 } 309 } 310 311 if len(candidates) > n { 312 candidates = candidates[:n] 313 } 314 315 return candidates 316} 317 318// scoreTree recursively scores all nodes in the tree. 319func (s *Scorer) scoreTree(node *html.Node, scores map[*html.Node]*ContentScore) { 320 if node == nil { 321 return 322 } 323 324 if node.Type == html.ElementNode { 325 tag := strings.ToLower(node.Data) 326 if tag != "script" && tag != "style" && tag != "noscript" { 327 score := s.ScoreNode(node) 328 if score != nil && score.Score > 0 { 329 scores[node] = score 330 s.ScoreAncestors(scores, node, score.Score) 331 } 332 } 333 } 334 335 for child := node.FirstChild; child != nil; child = child.NextSibling { 336 s.scoreTree(child, scores) 337 } 338} 339 340// calculateConfidence estimates how confident we are in this content selection (between 0 & 1). 
341func (s *Scorer) calculateConfidence(score *ContentScore) float64 { 342 if score == nil { 343 return 0.0 344 } 345 346 confidence := 0.0 347 348 if score.Score > s.minScore*2 { 349 confidence += 0.3 350 } else if score.Score > s.minScore { 351 confidence += 0.15 352 } 353 354 if score.TextLength > s.minContentLength*3 { 355 confidence += 0.3 356 } else if score.TextLength > s.minContentLength { 357 confidence += 0.15 358 } 359 360 if score.LinkDensity < 0.2 { 361 confidence += 0.2 362 } else if score.LinkDensity < 0.4 { 363 confidence += 0.1 364 } 365 366 if score.ParagraphCount >= 3 { 367 confidence += 0.2 368 } else if score.ParagraphCount >= 1 { 369 confidence += 0.1 370 } 371 372 if confidence > 1.0 { 373 confidence = 1.0 374 } 375 376 return confidence 377} 378 379// IsProbablyReadable determines if a document is likely to have extractable content. 380// This is inspired by Readability.js's isProbablyReaderable function. 381func (s *Scorer) IsProbablyReadable(doc *html.Node) bool { 382 if doc == nil { 383 return false 384 } 385 386 paragraphCount := s.countParagraphs(doc) 387 textLength := s.calculateTextLength(doc) 388 return paragraphCount >= 3 && textLength >= s.minContentLength 389}