cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
import (
	"math"
	"regexp"
	"sort"
	"strings"

	"golang.org/x/net/html"
)
10
// ContentScore represents the score and metadata for a content node.
type ContentScore struct {
	Node            *html.Node // the DOM element this score describes
	Score           float64    // accumulated heuristic score; higher means more likely main content
	TextLength      int        // trimmed length (bytes) of all text under Node
	LinkDensity     float64    // ratio of anchor text length to total text length, in [0, 1]
	ParagraphCount  int        // number of <p> elements under Node (including Node itself)
	AncestorDepth   int        // number of parents between Node and the tree root
	ConfidenceLevel float64    // heuristic confidence in [0, 1]; see calculateConfidence
}
21
// Scorer implements Readability-style heuristic scoring for content extraction.
type Scorer struct {
	linkDensityWeight   float64 // multiplier applied to link density (negative: links lower the score)
	classWeightPositive float64 // bonus when class/id matches positivePattern
	classWeightNegative float64 // penalty when class/id matches negativePattern
	paragraphWeight     float64 // score contribution per contained paragraph
	ancestorDecayFactor float64 // per-level decay base used when propagating scores to ancestors
	minContentLength    int     // minimum trimmed text length for a candidate node
	minScore            float64 // minimum score for a candidate node
	positivePattern     *regexp.Regexp // class/id tokens suggesting main content
	negativePattern     *regexp.Regexp // class/id tokens suggesting chrome, ads, or navigation
	unlikelyPattern     *regexp.Regexp // class/id tokens that disqualify a node outright
}
35
36// NewScorer creates a new Scorer with default Readability.js-inspired weights.
37func NewScorer() *Scorer {
38 return &Scorer{
39 linkDensityWeight: -1.0,
40 classWeightPositive: 25.0,
41 classWeightNegative: -25.0,
42 paragraphWeight: 1.0,
43 ancestorDecayFactor: 0.5,
44 minContentLength: 140,
45 minScore: 20.0,
46
47 positivePattern: regexp.MustCompile(`(?i)(article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story)`),
48 negativePattern: regexp.MustCompile(`(?i)(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|ad-|advertisement|breadcrumb|hidden|nav|menu|header)`),
49 unlikelyPattern: regexp.MustCompile(`(?i)(banner|cookie|popup|modal)`),
50 }
51}
52
53// ScoreNode calculates a content score for the given node based on multiple heuristics.
54// This implements the core Readability scoring algorithm.
55func (s *Scorer) ScoreNode(node *html.Node) *ContentScore {
56 if node == nil || node.Type != html.ElementNode {
57 return nil
58 }
59
60 score := &ContentScore{
61 Node: node,
62 Score: 0.0,
63 AncestorDepth: s.calculateDepth(node),
64 }
65
66 score.Score = s.getTagScore(node.Data)
67 score.Score += s.getClassIdScore(node)
68
69 score.TextLength = s.calculateTextLength(node)
70 score.LinkDensity = s.calculateLinkDensity(node)
71 score.ParagraphCount = s.countParagraphs(node)
72
73 score.Score += score.LinkDensity * s.linkDensityWeight
74 score.Score += float64(score.ParagraphCount) * s.paragraphWeight
75 score.Score += s.getTextLengthScore(score.TextLength)
76
77 score.ConfidenceLevel = s.calculateConfidence(score)
78 return score
79}
80
81// getTagScore returns a base score based on the HTML tag type.
82// Some tags are more likely to contain main content than others.
83func (s *Scorer) getTagScore(tagName string) float64 {
84 switch strings.ToLower(tagName) {
85 case "article":
86 return 30.0
87 case "section":
88 return 15.0
89 case "div":
90 return 5.0
91 case "main":
92 return 40.0
93 case "p":
94 return 3.0
95 case "pre", "td", "blockquote":
96 return 3.0
97 case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
98 return -3.0
99 case "h1", "h2", "h3", "h4", "h5", "h6", "th":
100 return -5.0
101 default:
102 return 0.0
103 }
104}
105
106// getClassIdScore analyzes class and ID attributes for positive/negative indicators.
107// Returns a positive score for content-like names, negative for navigation/ads.
108func (s *Scorer) getClassIdScore(node *html.Node) float64 {
109 score := 0.0
110 classID := s.getClassAndID(node)
111
112 if classID == "" {
113 return 0.0
114 }
115
116 if s.unlikelyPattern.MatchString(classID) {
117 return -50.0
118 }
119
120 if s.negativePattern.MatchString(classID) {
121 score += s.classWeightNegative
122 }
123
124 if s.positivePattern.MatchString(classID) {
125 score += s.classWeightPositive
126 }
127
128 return score
129}
130
131// getClassAndID concatenates class and ID attributes for pattern matching.
132func (s *Scorer) getClassAndID(node *html.Node) string {
133 var parts []string
134
135 for _, attr := range node.Attr {
136 if attr.Key == "class" || attr.Key == "id" {
137 parts = append(parts, attr.Val)
138 }
139 }
140
141 return strings.Join(parts, " ")
142}
143
144// calculateTextLength returns the total text length within the node.
145func (s *Scorer) calculateTextLength(node *html.Node) int {
146 text := s.getInnerText(node)
147 return len(strings.TrimSpace(text))
148}
149
150// calculateLinkDensity calculates the ratio of link text to total text.
151// Higher link density indicates navigation or related links, not main content.
152func (s *Scorer) calculateLinkDensity(node *html.Node) float64 {
153 totalText := s.getInnerText(node)
154 linkText := s.getLinkText(node)
155
156 totalLen := len(strings.TrimSpace(totalText))
157 linkLen := len(strings.TrimSpace(linkText))
158
159 if totalLen == 0 {
160 return 0.0
161 }
162
163 return float64(linkLen) / float64(totalLen)
164}
165
166// getInnerText extracts all text content from a node and its descendants.
167func (s *Scorer) getInnerText(node *html.Node) string {
168 var buf strings.Builder
169 s.extractText(node, &buf)
170 return buf.String()
171}
172
173// extractText recursively extracts text from a node tree.
174func (s *Scorer) extractText(node *html.Node, buf *strings.Builder) {
175 if node == nil {
176 return
177 }
178
179 if node.Type == html.TextNode {
180 buf.WriteString(node.Data)
181 buf.WriteString(" ")
182 return
183 }
184
185 if node.Type == html.ElementNode {
186 tag := strings.ToLower(node.Data)
187 if tag == "script" || tag == "style" || tag == "noscript" {
188 return
189 }
190 }
191
192 for child := node.FirstChild; child != nil; child = child.NextSibling {
193 s.extractText(child, buf)
194 }
195}
196
197// getLinkText extracts text from anchor tags only.
198func (s *Scorer) getLinkText(node *html.Node) string {
199 var buf strings.Builder
200 s.extractLinkText(node, &buf)
201 return buf.String()
202}
203
204// extractLinkText recursively extracts text from anchor tags.
205func (s *Scorer) extractLinkText(node *html.Node, buf *strings.Builder) {
206 if node == nil {
207 return
208 }
209
210 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "a" {
211 s.extractText(node, buf)
212 return
213 }
214
215 for child := node.FirstChild; child != nil; child = child.NextSibling {
216 s.extractLinkText(child, buf)
217 }
218}
219
220// countParagraphs counts paragraph elements within the node.
221func (s *Scorer) countParagraphs(node *html.Node) int {
222 count := 0
223 s.walkParagraphs(node, &count)
224 return count
225}
226
227// walkParagraphs recursively counts paragraph elements.
228func (s *Scorer) walkParagraphs(node *html.Node, count *int) {
229 if node == nil {
230 return
231 }
232
233 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "p" {
234 *count++
235 }
236
237 for child := node.FirstChild; child != nil; child = child.NextSibling {
238 s.walkParagraphs(child, count)
239 }
240}
241
242// getTextLengthScore provides a bonus for nodes with substantial text content.
243func (s *Scorer) getTextLengthScore(textLen int) float64 {
244 if textLen < 25 {
245 return 0.0
246 }
247 return math.Log10(float64(textLen)) * 2.0
248}
249
250// calculateDepth calculates how deep in the DOM tree this node is.
251func (s *Scorer) calculateDepth(node *html.Node) int {
252 depth := 0
253 for n := node.Parent; n != nil; n = n.Parent {
254 depth++
255 }
256 return depth
257}
258
259// ScoreAncestors propagates scores up the DOM tree with decay.
260// This implements the Readability algorithm's ancestor scoring.
261func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64) {
262 if node == nil || baseScore <= 0 {
263 return
264 }
265
266 currentScore := baseScore
267 level := 0
268
269 for parent := node.Parent; parent != nil && level < 5; parent = parent.Parent {
270 if parent.Type != html.ElementNode {
271 continue
272 }
273
274 if _, exists := scores[parent]; !exists {
275 scores[parent] = s.ScoreNode(parent)
276 if scores[parent] == nil {
277 continue
278 }
279 }
280
281 decayedScore := currentScore * math.Pow(s.ancestorDecayFactor, float64(level+1))
282 scores[parent].Score += decayedScore
283 level++
284 }
285}
286
287// FindTopCandidates identifies the N highest-scoring content candidates.
288func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore {
289 if root == nil || n <= 0 {
290 return nil
291 }
292
293 scores := make(map[*html.Node]*ContentScore)
294 s.scoreTree(root, scores)
295
296 var candidates []*ContentScore
297 for _, score := range scores {
298 if score.Score >= s.minScore && score.TextLength >= s.minContentLength {
299 candidates = append(candidates, score)
300 }
301 }
302
303 for i := 0; i < len(candidates); i++ {
304 for j := i + 1; j < len(candidates); j++ {
305 if candidates[j].Score > candidates[i].Score {
306 candidates[i], candidates[j] = candidates[j], candidates[i]
307 }
308 }
309 }
310
311 if len(candidates) > n {
312 candidates = candidates[:n]
313 }
314
315 return candidates
316}
317
318// scoreTree recursively scores all nodes in the tree.
319func (s *Scorer) scoreTree(node *html.Node, scores map[*html.Node]*ContentScore) {
320 if node == nil {
321 return
322 }
323
324 if node.Type == html.ElementNode {
325 tag := strings.ToLower(node.Data)
326 if tag != "script" && tag != "style" && tag != "noscript" {
327 score := s.ScoreNode(node)
328 if score != nil && score.Score > 0 {
329 scores[node] = score
330 s.ScoreAncestors(scores, node, score.Score)
331 }
332 }
333 }
334
335 for child := node.FirstChild; child != nil; child = child.NextSibling {
336 s.scoreTree(child, scores)
337 }
338}
339
340// calculateConfidence estimates how confident we are in this content selection (between 0 & 1).
341func (s *Scorer) calculateConfidence(score *ContentScore) float64 {
342 if score == nil {
343 return 0.0
344 }
345
346 confidence := 0.0
347
348 if score.Score > s.minScore*2 {
349 confidence += 0.3
350 } else if score.Score > s.minScore {
351 confidence += 0.15
352 }
353
354 if score.TextLength > s.minContentLength*3 {
355 confidence += 0.3
356 } else if score.TextLength > s.minContentLength {
357 confidence += 0.15
358 }
359
360 if score.LinkDensity < 0.2 {
361 confidence += 0.2
362 } else if score.LinkDensity < 0.4 {
363 confidence += 0.1
364 }
365
366 if score.ParagraphCount >= 3 {
367 confidence += 0.2
368 } else if score.ParagraphCount >= 1 {
369 confidence += 0.1
370 }
371
372 if confidence > 1.0 {
373 confidence = 1.0
374 }
375
376 return confidence
377}
378
// IsProbablyReadable determines if a document is likely to have extractable content.
// This is inspired by Readability.js's isProbablyReaderable function.
// It requires at least three <p> elements and minContentLength bytes of
// trimmed text anywhere under doc; a nil document is never readable.
func (s *Scorer) IsProbablyReadable(doc *html.Node) bool {
	if doc == nil {
		return false
	}

	paragraphCount := s.countParagraphs(doc)
	textLength := s.calculateTextLength(doc)
	return paragraphCount >= 3 && textLength >= s.minContentLength
389}