cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang

feat(articles): add content scoring and readability assessment

* Implemented a new Scorer type for heuristic scoring of HTML content, based on readability principles.

* Developed a confidence calculation method to assess the reliability of content scores.
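
For orientation, a minimal sketch of how the new extractor is driven (a hypothetical example test, not part of this diff; it assumes only the APIs added in internal/articles/heuristics.go below):

```go
package articles

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// ExampleHeuristicExtractor_ExtractContent is a hypothetical example test
// exercising the new extractor end to end on an in-memory document.
func ExampleHeuristicExtractor_ExtractContent() {
	doc, _ := html.Parse(strings.NewReader(`<html><body><article>
	<p>First paragraph with enough text to score as readable content.</p>
	<p>Second paragraph adding more body text for the heuristics.</p>
	</article></body></html>`))

	// ExtractContent always reports "heuristic" as the method for a
	// non-nil document; Confidence varies with the scorer's verdict.
	result := NewHeuristicExtractor().ExtractContent(doc)
	fmt.Println(result.ExtractionMethod)
	// Output: heuristic
}
```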

+2858 -136
+458
internal/articles/heuristics.go
```go
package articles

import (
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)

// ExtractionResult contains the results of heuristic content extraction.
type ExtractionResult struct {
	Content          string
	Title            string
	Author           string
	PublishedDate    string
	SiteName         string
	Language         string
	Confidence       float64
	ExtractionMethod string // "heuristic", "xpath", or "dual"
}

// HeuristicExtractor implements Readability-style content extraction.
type HeuristicExtractor struct {
	scorer *Scorer
}

// NewHeuristicExtractor creates a new extractor with default scoring.
func NewHeuristicExtractor() *HeuristicExtractor {
	return &HeuristicExtractor{
		scorer: NewScorer(),
	}
}

// ExtractContent performs heuristic-based content extraction from an HTML document.
func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	if !e.scorer.IsProbablyReadable(doc) {
		return &ExtractionResult{
			Confidence:       0.1,
			ExtractionMethod: "heuristic",
		}
	}

	cleaned := e.cleanDocument(doc)
	candidates := e.scorer.FindTopCandidates(cleaned, 5)
	if len(candidates) == 0 {
		return &ExtractionResult{
			Confidence:       0.2,
			ExtractionMethod: "heuristic",
		}
	}

	topCandidate := candidates[0]
	content := e.extractTextContent(topCandidate.Node)
	result := &ExtractionResult{
		Content:          content,
		Confidence:       topCandidate.ConfidenceLevel,
		ExtractionMethod: "heuristic",
	}

	return result
}

// cleanDocument removes unwanted elements and prepares the document for extraction.
func (e *HeuristicExtractor) cleanDocument(doc *html.Node) *html.Node {
	cloned := e.cloneNode(doc)

	e.removeElements(cloned, "script", "style", "noscript", "iframe", "embed", "object")
	e.removeHiddenElements(cloned)
	e.removeUnlikelyCandidates(cloned)
	e.removeHighLinkDensityElements(cloned)

	return cloned
}

// cloneNode creates a deep copy of an HTML node tree.
func (e *HeuristicExtractor) cloneNode(node *html.Node) *html.Node {
	if node == nil {
		return nil
	}

	clone := &html.Node{
		Type:      node.Type,
		Data:      node.Data,
		DataAtom:  node.DataAtom,
		Namespace: node.Namespace,
		Attr:      make([]html.Attribute, len(node.Attr)),
	}

	copy(clone.Attr, node.Attr)

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		clonedChild := e.cloneNode(child)
		if clonedChild != nil {
			clone.AppendChild(clonedChild)
		}
	}

	return clone
}

// removeElements removes all elements with the specified tag names.
func (e *HeuristicExtractor) removeElements(root *html.Node, tagNames ...string) {
	if root == nil {
		return
	}

	tagMap := make(map[string]bool)
	for _, tag := range tagNames {
		tagMap[strings.ToLower(tag)] = true
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if tagMap[strings.ToLower(node.Data)] {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHiddenElements removes elements that are hidden via CSS or attributes.
func (e *HeuristicExtractor) removeHiddenElements(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			for _, attr := range node.Attr {
				if attr.Key == "hidden" {
					toRemove = append(toRemove, node)
					return
				}

				if attr.Key == "style" {
					style := strings.ToLower(attr.Val)
					if strings.Contains(style, "display:none") || strings.Contains(style, "display: none") ||
						strings.Contains(style, "visibility:hidden") || strings.Contains(style, "visibility: hidden") {
						toRemove = append(toRemove, node)
						return
					}
				}

				if attr.Key == "aria-hidden" && attr.Val == "true" {
					toRemove = append(toRemove, node)
					return
				}
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeUnlikelyCandidates removes elements that are unlikely to be main content.
func (e *HeuristicExtractor) removeUnlikelyCandidates(root *html.Node) {
	if root == nil {
		return
	}

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			score := e.scorer.getClassIdScore(node)

			if score < -40 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// removeHighLinkDensityElements removes elements with excessive link density.
func (e *HeuristicExtractor) removeHighLinkDensityElements(root *html.Node) {
	if root == nil {
		return
	}

	const linkDensityThreshold = 0.75

	var toRemove []*html.Node

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			if strings.ToLower(node.Data) == "a" {
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					walk(child)
				}
				return
			}

			density := e.scorer.calculateLinkDensity(node)
			textLen := e.scorer.calculateTextLength(node)

			if density > linkDensityThreshold && textLen < 500 {
				toRemove = append(toRemove, node)
				return
			}
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(root)

	for _, node := range toRemove {
		if node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

// extractTextContent extracts cleaned text from a node.
func (e *HeuristicExtractor) extractTextContent(node *html.Node) string {
	if node == nil {
		return ""
	}

	var buf strings.Builder
	e.extractTextRecursive(node, &buf)

	text := buf.String()
	text = normalizeWhitespace(text)
	text = strings.TrimSpace(text)

	return text
}

// extractTextRecursive recursively extracts text with basic formatting.
func (e *HeuristicExtractor) extractTextRecursive(node *html.Node, buf *strings.Builder) {
	if node == nil {
		return
	}

	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
		return
	}

	if node.Type == html.ElementNode {
		tag := strings.ToLower(node.Data)

		if e.isBlockElement(tag) && buf.Len() > 0 {
			buf.WriteString("\n\n")
		}

		if tag == "li" {
			buf.WriteString("\n• ")
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			e.extractTextRecursive(child, buf)
		}

		if e.isBlockElement(tag) {
			buf.WriteString("\n")
		}
	}
}

// isBlockElement returns true for block-level HTML elements.
func (e *HeuristicExtractor) isBlockElement(tagName string) bool {
	blockElements := map[string]bool{
		"p":          true,
		"div":        true,
		"article":    true,
		"section":    true,
		"h1":         true,
		"h2":         true,
		"h3":         true,
		"h4":         true,
		"h5":         true,
		"h6":         true,
		"blockquote": true,
		"pre":        true,
		"ul":         true,
		"ol":         true,
		"table":      true,
		"tr":         true,
		"td":         true,
		"th":         true,
	}

	return blockElements[tagName]
}

// CompareWithXPath compares heuristic extraction with XPath-based extraction.
func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	heuristicResult := e.ExtractContent(doc)
	if heuristicResult == nil {
		heuristicResult = &ExtractionResult{
			ExtractionMethod: "heuristic",
			Confidence:       0.0,
		}
	}

	if xpathNode == nil {
		return heuristicResult
	}

	xpathContent := e.extractTextContent(xpathNode)
	xpathLen := len(xpathContent)
	heuristicLen := len(heuristicResult.Content)

	similarity := e.calculateSimilarity(xpathContent, heuristicResult.Content)

	if similarity > 0.8 {
		heuristicResult.Confidence = 0.95
		heuristicResult.ExtractionMethod = "dual-validated"
		return heuristicResult
	} else if float64(xpathLen) > float64(heuristicLen)*1.5 {
		return &ExtractionResult{
			Content:          xpathContent,
			Confidence:       0.85,
			ExtractionMethod: "xpath-preferred",
		}
	} else if float64(heuristicLen) > float64(xpathLen)*1.5 {
		heuristicResult.Confidence = 0.80
		heuristicResult.ExtractionMethod = "heuristic-preferred"
		return heuristicResult
	} else {
		heuristicResult.Confidence = 0.70
		heuristicResult.ExtractionMethod = "heuristic-fallback"
		return heuristicResult
	}
}

// calculateSimilarity estimates content similarity (simple ratio of common words).
func (e *HeuristicExtractor) calculateSimilarity(text1, text2 string) float64 {
	if len(text1) == 0 || len(text2) == 0 {
		if len(text1) == 0 && len(text2) == 0 {
			return 1.0
		}
		return 0.0
	}

	words1 := strings.Fields(strings.ToLower(text1))
	words2 := strings.Fields(strings.ToLower(text2))

	if len(words1) == 0 || len(words2) == 0 {
		return 0.0
	}

	freq1 := make(map[string]int)
	freq2 := make(map[string]int)

	for _, word := range words1 {
		freq1[word]++
	}

	for _, word := range words2 {
		freq2[word]++
	}

	common := 0
	for word := range freq1 {
		if freq2[word] > 0 {
			common++
		}
	}

	union := len(freq1) + len(freq2) - common
	if union == 0 {
		return 0.0
	}

	return float64(common) / float64(union)
}

// ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first.
// Falls back to heuristic scoring if semantic elements aren't found.
func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return nil
	}

	articleNode := htmlquery.FindOne(doc, "//article")
	if articleNode != nil {
		content := e.extractTextContent(articleNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.90,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	mainNode := htmlquery.FindOne(doc, "//main")
	if mainNode != nil {
		content := e.extractTextContent(mainNode)
		if len(content) > e.scorer.minContentLength {
			return &ExtractionResult{
				Content:          content,
				Confidence:       0.88,
				ExtractionMethod: "semantic-html",
			}
		}
	}

	return e.ExtractContent(doc)
}
```
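A note on calculateSimilarity: it is effectively a Jaccard index over the sets of unique lowercase words. For example, if each text contains 5 unique words and the two share 3 of them, the score is 3 / (5 + 5 - 3) = 3/7 ≈ 0.43, well below the 0.8 agreement threshold CompareWithXPath uses for "dual-validated"; when the texts disagree, whichever extraction is more than 1.5× longer wins instead.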
+443
internal/articles/heuristics_test.go
```go
package articles

import (
	"strings"
	"testing"

	"golang.org/x/net/html"
)

func TestHeuristicExtractor(t *testing.T) {
	t.Run("NewHeuristicExtractor", func(t *testing.T) {
		t.Run("creates extractor with scorer", func(t *testing.T) {
			extractor := NewHeuristicExtractor()

			if extractor == nil {
				t.Fatal("Expected extractor to be created, got nil")
			}

			if extractor.scorer == nil {
				t.Error("Expected extractor to have scorer")
			}
		})
	})

	t.Run("ExtractContent", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("extracts content from article", func(t *testing.T) {
			htmlStr := `<html><body>
			<article class="main-content">
				<p>This is the first paragraph of the article with substantial content.</p>
				<p>This is the second paragraph with more information and details.</p>
				<p>And this is the third paragraph to ensure sufficient content.</p>
			</article>
			<aside class="sidebar"><a href="#">Sidebar link</a></aside>
			</body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.ExtractContent(doc)

			if result == nil {
				t.Fatal("Expected extraction result, got nil")
			}

			if result.Content == "" {
				t.Error("Expected content to be extracted")
			}

			if result.Confidence == 0.0 {
				t.Error("Expected non-zero confidence")
			}

			if !strings.Contains(result.Content, "first paragraph") {
				t.Error("Expected content to contain article text")
			}
		})

		t.Run("returns low confidence for unreadable document", func(t *testing.T) {
			htmlStr := `<html><body><div>Short</div></body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.ExtractContent(doc)

			if result == nil {
				t.Fatal("Expected extraction result, got nil")
			}

			if result.Confidence > 0.3 {
				t.Errorf("Expected low confidence for short document, got %f", result.Confidence)
			}
		})

		t.Run("returns nil for nil document", func(t *testing.T) {
			result := extractor.ExtractContent(nil)

			if result != nil {
				t.Error("Expected nil for nil document")
			}
		})
	})

	t.Run("cleanDocument", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("removes script and style tags", func(t *testing.T) {
			htmlStr := `<html><body>
			<script>alert('test');</script>
			<style>.test { color: red; }</style>
			<p>Content</p>
			</body></html>`
			doc := parseHTML(htmlStr)

			cleaned := extractor.cleanDocument(doc)

			script := findElement(cleaned, "script")
			style := findElement(cleaned, "style")

			if script != nil {
				t.Error("Expected script tag to be removed")
			}

			if style != nil {
				t.Error("Expected style tag to be removed")
			}
		})

		t.Run("removes hidden elements", func(t *testing.T) {
			htmlStr := `<html><body>
			<div style="display:none">Hidden</div>
			<div hidden>Also hidden</div>
			<p>Visible</p>
			</body></html>`
			doc := parseHTML(htmlStr)

			cleaned := extractor.cleanDocument(doc)

			// Count divs - should only have visible ones
			divCount := 0
			var countDivs func(*html.Node)
			countDivs = func(node *html.Node) {
				if node.Type == html.ElementNode && node.Data == "div" {
					divCount++
				}
				for child := node.FirstChild; child != nil; child = child.NextSibling {
					countDivs(child)
				}
			}
			countDivs(cleaned)

			if divCount > 0 {
				t.Errorf("Expected hidden divs to be removed, found %d", divCount)
			}
		})

		t.Run("removes high link density elements", func(t *testing.T) {
			htmlStr := `<html><body>
			<div class="links">
				<a href="#">Link1</a>
				<a href="#">Link2</a>
				<a href="#">Link3</a>
			</div>
			<p>Regular paragraph with actual content that should remain.</p>
			</body></html>`
			doc := parseHTML(htmlStr)

			cleaned := extractor.cleanDocument(doc)

			p := findElement(cleaned, "p")
			if p == nil {
				t.Error("Expected paragraph to remain")
			}
		})
	})

	t.Run("extractTextContent", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("extracts text with basic formatting", func(t *testing.T) {
			htmlStr := `<html><body><div>
			<p>First paragraph</p>
			<p>Second paragraph</p>
			</div></body></html>`
			doc := parseHTML(htmlStr)
			div := findElement(doc, "div")

			text := extractor.extractTextContent(div)

			if !strings.Contains(text, "First paragraph") {
				t.Error("Expected text to contain first paragraph")
			}

			if !strings.Contains(text, "Second paragraph") {
				t.Error("Expected text to contain second paragraph")
			}
		})

		t.Run("formats list items with bullets", func(t *testing.T) {
			htmlStr := `<html><body><ul>
			<li>Item 1</li>
			<li>Item 2</li>
			</ul></body></html>`
			doc := parseHTML(htmlStr)
			ul := findElement(doc, "ul")

			text := extractor.extractTextContent(ul)

			if !strings.Contains(text, "•") {
				t.Error("Expected text to contain bullet points")
			}
		})

		t.Run("returns empty string for nil node", func(t *testing.T) {
			text := extractor.extractTextContent(nil)

			if text != "" {
				t.Error("Expected empty string for nil node")
			}
		})
	})

	t.Run("CompareWithXPath", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("high confidence when XPath and heuristics agree", func(t *testing.T) {
			htmlStr := `<html><body>
			<article>
				<p>This is substantial content that both methods should find.</p>
				<p>Another paragraph with more details and information.</p>
				<p>And a third paragraph for good measure and completeness.</p>
			</article>
			</body></html>`
			doc := parseHTML(htmlStr)
			article := findElement(doc, "article")

			result := extractor.CompareWithXPath(doc, article)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			if result.Confidence < 0.8 {
				t.Errorf("Expected high confidence when methods agree, got %f", result.Confidence)
			}

			if !strings.Contains(result.ExtractionMethod, "dual") && !strings.Contains(result.ExtractionMethod, "validated") {
				t.Errorf("Expected dual validation method, got %s", result.ExtractionMethod)
			}
		})

		t.Run("prefers XPath when it extracts more content", func(t *testing.T) {
			htmlStr := `<html><body>
			<div class="content">
				<p>Short content</p>
			</div>
			<div class="more">
				<p>This is additional content that XPath found but heuristics might miss.</p>
				<p>Even more content here to make a significant difference in length.</p>
				<p>And yet another paragraph to ensure XPath extraction is substantially longer.</p>
			</div>
			</body></html>`
			doc := parseHTML(htmlStr)

			// XPath would get more content
			body := findElement(doc, "body")

			result := extractor.CompareWithXPath(doc, body)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			// Should prefer one method over the other
			if result.ExtractionMethod == "heuristic" {
				t.Errorf("Expected method preference, got %s", result.ExtractionMethod)
			}
		})

		t.Run("uses heuristics when XPath node is nil", func(t *testing.T) {
			htmlStr := `<html><body>
			<article>
				<p>Content that heuristics should find on its own.</p>
				<p>Additional paragraph for sufficient content length.</p>
				<p>Third paragraph to meet minimum requirements.</p>
			</article>
			</body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.CompareWithXPath(doc, nil)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			if result.ExtractionMethod != "heuristic" {
				t.Errorf("Expected heuristic method when XPath is nil, got %s", result.ExtractionMethod)
			}
		})

		t.Run("returns nil for nil document", func(t *testing.T) {
			result := extractor.CompareWithXPath(nil, nil)

			if result != nil {
				t.Error("Expected nil for nil document")
			}
		})
	})

	t.Run("calculateSimilarity", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("returns high similarity for identical text", func(t *testing.T) {
			text := "This is some test content"

			similarity := extractor.calculateSimilarity(text, text)

			if similarity < 0.9 {
				t.Errorf("Expected high similarity for identical text, got %f", similarity)
			}
		})

		t.Run("returns low similarity for different text", func(t *testing.T) {
			text1 := "This is the first piece of content"
			text2 := "Completely different words and phrases"

			similarity := extractor.calculateSimilarity(text1, text2)

			if similarity > 0.3 {
				t.Errorf("Expected low similarity for different text, got %f", similarity)
			}
		})

		t.Run("returns zero for empty strings", func(t *testing.T) {
			similarity := extractor.calculateSimilarity("text", "")

			if similarity != 0.0 {
				t.Errorf("Expected zero similarity for empty string, got %f", similarity)
			}
		})

		t.Run("returns one for both empty", func(t *testing.T) {
			similarity := extractor.calculateSimilarity("", "")

			if similarity != 1.0 {
				t.Errorf("Expected 1.0 similarity for both empty, got %f", similarity)
			}
		})
	})

	t.Run("ExtractWithSemanticHTML", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("extracts from article tag", func(t *testing.T) {
			htmlStr := `<html><body>
			<nav>Navigation</nav>
			<article>
				<p>This is the main article content that should be extracted.</p>
				<p>Second paragraph of the article with more information.</p>
				<p>Third paragraph to provide sufficient content length.</p>
			</article>
			<aside>Sidebar</aside>
			</body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.ExtractWithSemanticHTML(doc)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			if result.ExtractionMethod != "semantic-html" {
				t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
			}

			if !strings.Contains(result.Content, "main article content") {
				t.Error("Expected content from article tag")
			}

			if result.Confidence < 0.85 {
				t.Errorf("Expected high confidence for semantic HTML, got %f", result.Confidence)
			}
		})

		t.Run("extracts from main tag", func(t *testing.T) {
			htmlStr := `<html><body>
			<header>Header</header>
			<main>
				<p>This is the main content area with sufficient text.</p>
				<p>Additional content paragraph with more details.</p>
				<p>Third paragraph for completeness and length.</p>
			</main>
			<footer>Footer</footer>
			</body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.ExtractWithSemanticHTML(doc)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			if result.ExtractionMethod != "semantic-html" {
				t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
			}

			if !strings.Contains(result.Content, "main content area") {
				t.Error("Expected content from main tag")
			}
		})

		t.Run("falls back to heuristics without semantic tags", func(t *testing.T) {
			htmlStr := `<html><body>
			<div class="content">
				<p>Content in a regular div without semantic HTML tags.</p>
				<p>Second paragraph with additional information.</p>
				<p>Third paragraph for sufficient content.</p>
			</div>
			</body></html>`
			doc := parseHTML(htmlStr)

			result := extractor.ExtractWithSemanticHTML(doc)

			if result == nil {
				t.Fatal("Expected result, got nil")
			}

			if result.ExtractionMethod == "semantic-html" {
				t.Error("Should not use semantic-html method without semantic tags")
			}
		})

		t.Run("returns nil for nil document", func(t *testing.T) {
			result := extractor.ExtractWithSemanticHTML(nil)

			if result != nil {
				t.Error("Expected nil for nil document")
			}
		})
	})

	t.Run("isBlockElement", func(t *testing.T) {
		extractor := NewHeuristicExtractor()

		t.Run("identifies block elements", func(t *testing.T) {
			blockTags := []string{"p", "div", "article", "h1", "section"}

			for _, tag := range blockTags {
				if !extractor.isBlockElement(tag) {
					t.Errorf("Expected %s to be a block element", tag)
				}
			}
		})

		t.Run("identifies non-block elements", func(t *testing.T) {
			inlineTags := []string{"span", "a", "em", "strong", "code"}

			for _, tag := range inlineTags {
				if extractor.isBlockElement(tag) {
					t.Errorf("Expected %s to not be a block element", tag)
				}
			}
		})
	})
}
```
+305
internal/articles/metadata.go
```go
package articles

import (
	"encoding/json"
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)

// MetadataExtractor implements multi-strategy metadata extraction from HTML documents.
// It attempts to extract article metadata using OpenGraph, Schema.org, meta tags,
// and semantic HTML5 elements, with fallback chains for each field.
type MetadataExtractor struct{}

// NewMetadataExtractor creates a new metadata extractor.
func NewMetadataExtractor() *MetadataExtractor {
	return &MetadataExtractor{}
}

// ExtractMetadata extracts all available metadata from an HTML document.
// Returns an ExtractionResult with populated metadata fields.
func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult {
	if doc == nil {
		return &ExtractionResult{}
	}

	result := &ExtractionResult{}

	result.Title = m.ExtractTitle(doc)
	result.Author = m.ExtractAuthor(doc)
	result.PublishedDate = m.ExtractPublishedDate(doc)
	result.SiteName = m.ExtractSiteName(doc)
	result.Language = m.ExtractLanguage(doc)

	return result
}

// ExtractTitle extracts the article title using multiple strategies.
// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag.
func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string {
	if doc == nil {
		return ""
	}

	if title := m.getMetaContent(doc, "property", "og:title"); title != "" {
		return title
	}

	if title := m.getSchemaOrgField(doc, "headline"); title != "" {
		return title
	}

	if title := m.getSchemaOrgField(doc, "name"); title != "" {
		return title
	}

	if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" {
		return title
	}

	if title := m.getMetaContent(doc, "property", "article:title"); title != "" {
		return title
	}

	if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil {
		if title := htmlquery.InnerText(h1); title != "" {
			return strings.TrimSpace(title)
		}
	}

	if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil {
		if title := htmlquery.InnerText(titleNode); title != "" {
			return strings.TrimSpace(title)
		}
	}

	return ""
}

// ExtractAuthor extracts the article author using multiple strategies.
// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements.
func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string {
	if doc == nil {
		return ""
	}

	if author := m.getMetaContent(doc, "property", "og:author"); author != "" {
		return author
	}

	if author := m.getSchemaOrgField(doc, "author"); author != "" {
		return author
	}

	if author := m.getMetaContent(doc, "property", "article:author"); author != "" {
		return author
	}

	if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" {
		return author
	}

	if author := m.getMetaContent(doc, "name", "author"); author != "" {
		return author
	}

	if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil {
		if author := htmlquery.InnerText(authorLink); author != "" {
			return strings.TrimSpace(author)
		}
	}

	bylineSelectors := []string{
		"//span[contains(@class, 'author')]",
		"//div[contains(@class, 'author')]",
		"//p[contains(@class, 'byline')]",
		"//span[contains(@class, 'byline')]",
	}

	for _, selector := range bylineSelectors {
		if node := htmlquery.FindOne(doc, selector); node != nil {
			if author := htmlquery.InnerText(node); author != "" {
				return strings.TrimSpace(author)
			}
		}
	}

	return ""
}

// ExtractPublishedDate extracts the publication date using multiple strategies.
// Tries in order: OpenGraph, Schema.org, article:published_time, time elements.
func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string {
	if doc == nil {
		return ""
	}

	if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" {
		return date
	}

	if date := m.getSchemaOrgField(doc, "datePublished"); date != "" {
		return date
	}

	if date := m.getSchemaOrgField(doc, "publishDate"); date != "" {
		return date
	}

	if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" {
		return date
	}

	if date := m.getMetaContent(doc, "name", "publication_date"); date != "" {
		return date
	}

	if date := m.getMetaContent(doc, "name", "date"); date != "" {
		return date
	}

	if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil {
		for _, attr := range timeNode.Attr {
			if attr.Key == "datetime" {
				return attr.Val
			}
		}
	}

	return ""
}

// ExtractSiteName extracts the site name using multiple strategies.
// Tries in order: OpenGraph, Schema.org, meta tags.
func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string {
	if doc == nil {
		return ""
	}

	if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" {
		return siteName
	}

	if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" {
		return publisher
	}

	if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" {
		return siteName
	}

	return ""
}

// ExtractLanguage extracts the document language.
// Tries in order: html lang attribute, OpenGraph, meta tags.
func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string {
	if doc == nil {
		return ""
	}

	if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil {
		for _, attr := range htmlNode.Attr {
			if attr.Key == "lang" {
				return attr.Val
			}
		}
	}

	if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" {
		return locale
	}

	if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" {
		return lang
	}

	return ""
}

// getMetaContent retrieves the content attribute from a meta tag.
// Searches for meta tags with the specified attribute name and value.
func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string {
	if doc == nil {
		return ""
	}

	xpath := "//meta[@" + attrName + "='" + attrValue + "']"
	metaNode := htmlquery.FindOne(doc, xpath)

	if metaNode == nil {
		return ""
	}

	for _, attr := range metaNode.Attr {
		if attr.Key == "content" {
			return strings.TrimSpace(attr.Val)
		}
	}

	return ""
}

// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data.
func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string {
	if doc == nil {
		return ""
	}

	scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']")

	for _, script := range scripts {
		if script.FirstChild == nil || script.FirstChild.Type != html.TextNode {
			continue
		}

		var data map[string]any
		if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil {
			continue
		}

		context, hasContext := data["@context"]
		typeVal, hasType := data["@type"]

		if !hasContext || !hasType {
			continue
		}

		contextStr, ok := context.(string)
		if !ok || !strings.Contains(contextStr, "schema.org") {
			continue
		}

		typeStr, ok := typeVal.(string)
		if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) {
			continue
		}

		if value, exists := data[fieldName]; exists {
			return m.extractStringValue(value)
		}
	}

	return ""
}

// extractStringValue extracts a string from various JSON value types.
func (m *MetadataExtractor) extractStringValue(value any) string {
	switch v := value.(type) {
	case string:
		return v
	case map[string]any:
		if name, exists := v["name"]; exists {
			if nameStr, ok := name.(string); ok {
				return nameStr
			}
		}
	case []any:
		if len(v) > 0 {
			return m.extractStringValue(v[0])
		}
	}
	return ""
}
```
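A quick sketch of the fallback chains in action (a hypothetical example test, not part of this diff, assuming only the APIs above): OpenGraph wins for the title while the author comes from JSON-LD and the language from the html lang attribute.

```go
package articles

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// ExampleMetadataExtractor_ExtractMetadata is a hypothetical illustration
// of the per-field fallback chains.
func ExampleMetadataExtractor_ExtractMetadata() {
	doc, _ := html.Parse(strings.NewReader(`<html lang="en"><head>
	<meta property="og:title" content="OG Title">
	<script type="application/ld+json">
	{"@context": "https://schema.org", "@type": "Article", "author": "JSON-LD Author"}
	</script>
	</head><body></body></html>`))

	meta := NewMetadataExtractor().ExtractMetadata(doc)
	fmt.Println(meta.Title, "/", meta.Author, "/", meta.Language)
	// Output: OG Title / JSON-LD Author / en
}
```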
+430
internal/articles/metadata_test.go
··· 1 + package articles 2 + 3 + import ( 4 + "strings" 5 + "testing" 6 + ) 7 + 8 + func TestMetadataExtractor(t *testing.T) { 9 + t.Run("NewMetadataExtractor", func(t *testing.T) { 10 + t.Run("creates extractor", func(t *testing.T) { 11 + extractor := NewMetadataExtractor() 12 + 13 + if extractor == nil { 14 + t.Fatal("Expected extractor to be created, got nil") 15 + } 16 + }) 17 + }) 18 + 19 + t.Run("ExtractTitle", func(t *testing.T) { 20 + extractor := NewMetadataExtractor() 21 + 22 + t.Run("extracts from OpenGraph", func(t *testing.T) { 23 + htmlStr := `<html><head> 24 + <meta property="og:title" content="Article Title from OpenGraph"> 25 + </head><body></body></html>` 26 + doc := parseHTML(htmlStr) 27 + 28 + title := extractor.ExtractTitle(doc) 29 + 30 + if title != "Article Title from OpenGraph" { 31 + t.Errorf("Expected OpenGraph title, got %q", title) 32 + } 33 + }) 34 + 35 + t.Run("extracts from title tag", func(t *testing.T) { 36 + htmlStr := `<html><head> 37 + <title>Page Title from Title Tag</title> 38 + </head><body></body></html>` 39 + doc := parseHTML(htmlStr) 40 + 41 + title := extractor.ExtractTitle(doc) 42 + 43 + if title != "Page Title from Title Tag" { 44 + t.Errorf("Expected title tag content, got %q", title) 45 + } 46 + }) 47 + 48 + t.Run("extracts from h1", func(t *testing.T) { 49 + htmlStr := `<html><body> 50 + <h1>Heading Title</h1> 51 + </body></html>` 52 + doc := parseHTML(htmlStr) 53 + 54 + title := extractor.ExtractTitle(doc) 55 + 56 + if title != "Heading Title" { 57 + t.Errorf("Expected h1 content, got %q", title) 58 + } 59 + }) 60 + 61 + t.Run("returns empty for nil document", func(t *testing.T) { 62 + title := extractor.ExtractTitle(nil) 63 + 64 + if title != "" { 65 + t.Errorf("Expected empty string for nil document, got %q", title) 66 + } 67 + }) 68 + 69 + t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) { 70 + htmlStr := `<html><head> 71 + <meta property="og:title" content="OpenGraph Title"> 72 + <title>HTML Title</title> 73 + </head><body></body></html>` 74 + doc := parseHTML(htmlStr) 75 + 76 + title := extractor.ExtractTitle(doc) 77 + 78 + if title != "OpenGraph Title" { 79 + t.Errorf("Expected OpenGraph title to have priority, got %q", title) 80 + } 81 + }) 82 + }) 83 + 84 + t.Run("ExtractAuthor", func(t *testing.T) { 85 + extractor := NewMetadataExtractor() 86 + 87 + t.Run("extracts from OpenGraph", func(t *testing.T) { 88 + htmlStr := `<html><head> 89 + <meta property="og:author" content="John Doe"> 90 + </head><body></body></html>` 91 + doc := parseHTML(htmlStr) 92 + 93 + author := extractor.ExtractAuthor(doc) 94 + 95 + if author != "John Doe" { 96 + t.Errorf("Expected OpenGraph author, got %q", author) 97 + } 98 + }) 99 + 100 + t.Run("extracts from meta tag", func(t *testing.T) { 101 + htmlStr := `<html><head> 102 + <meta name="author" content="Jane Smith"> 103 + </head><body></body></html>` 104 + doc := parseHTML(htmlStr) 105 + 106 + author := extractor.ExtractAuthor(doc) 107 + 108 + if author != "Jane Smith" { 109 + t.Errorf("Expected meta tag author, got %q", author) 110 + } 111 + }) 112 + 113 + t.Run("extracts from rel=author link", func(t *testing.T) { 114 + htmlStr := `<html><body> 115 + <a rel="author" href="/author/bob">Bob Johnson</a> 116 + </body></html>` 117 + doc := parseHTML(htmlStr) 118 + 119 + author := extractor.ExtractAuthor(doc) 120 + 121 + if author != "Bob Johnson" { 122 + t.Errorf("Expected rel=author link text, got %q", author) 123 + } 124 + }) 125 + 126 + t.Run("extracts from byline class", func(t *testing.T) 
{ 127 + htmlStr := `<html><body> 128 + <span class="author-name">Alice Brown</span> 129 + </body></html>` 130 + doc := parseHTML(htmlStr) 131 + 132 + author := extractor.ExtractAuthor(doc) 133 + 134 + if author != "Alice Brown" { 135 + t.Errorf("Expected byline class text, got %q", author) 136 + } 137 + }) 138 + 139 + t.Run("returns empty for nil document", func(t *testing.T) { 140 + author := extractor.ExtractAuthor(nil) 141 + 142 + if author != "" { 143 + t.Errorf("Expected empty string for nil document, got %q", author) 144 + } 145 + }) 146 + }) 147 + 148 + t.Run("ExtractPublishedDate", func(t *testing.T) { 149 + extractor := NewMetadataExtractor() 150 + 151 + t.Run("extracts from OpenGraph", func(t *testing.T) { 152 + htmlStr := `<html><head> 153 + <meta property="og:published_time" content="2025-01-15T10:00:00Z"> 154 + </head><body></body></html>` 155 + doc := parseHTML(htmlStr) 156 + 157 + date := extractor.ExtractPublishedDate(doc) 158 + 159 + if date != "2025-01-15T10:00:00Z" { 160 + t.Errorf("Expected OpenGraph date, got %q", date) 161 + } 162 + }) 163 + 164 + t.Run("extracts from article:published_time", func(t *testing.T) { 165 + htmlStr := `<html><head> 166 + <meta property="article:published_time" content="2025-02-20"> 167 + </head><body></body></html>` 168 + doc := parseHTML(htmlStr) 169 + 170 + date := extractor.ExtractPublishedDate(doc) 171 + 172 + if date != "2025-02-20" { 173 + t.Errorf("Expected article:published_time, got %q", date) 174 + } 175 + }) 176 + 177 + t.Run("extracts from time element", func(t *testing.T) { 178 + htmlStr := `<html><body> 179 + <time datetime="2025-03-25T14:30:00">March 25, 2025</time> 180 + </body></html>` 181 + doc := parseHTML(htmlStr) 182 + 183 + date := extractor.ExtractPublishedDate(doc) 184 + 185 + if date != "2025-03-25T14:30:00" { 186 + t.Errorf("Expected time element datetime, got %q", date) 187 + } 188 + }) 189 + 190 + t.Run("returns empty for nil document", func(t *testing.T) { 191 + date := extractor.ExtractPublishedDate(nil) 192 + 193 + if date != "" { 194 + t.Errorf("Expected empty string for nil document, got %q", date) 195 + } 196 + }) 197 + }) 198 + 199 + t.Run("ExtractSiteName", func(t *testing.T) { 200 + extractor := NewMetadataExtractor() 201 + 202 + t.Run("extracts from OpenGraph", func(t *testing.T) { 203 + htmlStr := `<html><head> 204 + <meta property="og:site_name" content="Example News"> 205 + </head><body></body></html>` 206 + doc := parseHTML(htmlStr) 207 + 208 + siteName := extractor.ExtractSiteName(doc) 209 + 210 + if siteName != "Example News" { 211 + t.Errorf("Expected OpenGraph site_name, got %q", siteName) 212 + } 213 + }) 214 + 215 + t.Run("extracts from application-name", func(t *testing.T) { 216 + htmlStr := `<html><head> 217 + <meta name="application-name" content="Tech Blog"> 218 + </head><body></body></html>` 219 + doc := parseHTML(htmlStr) 220 + 221 + siteName := extractor.ExtractSiteName(doc) 222 + 223 + if siteName != "Tech Blog" { 224 + t.Errorf("Expected application-name, got %q", siteName) 225 + } 226 + }) 227 + 228 + t.Run("returns empty for nil document", func(t *testing.T) { 229 + siteName := extractor.ExtractSiteName(nil) 230 + 231 + if siteName != "" { 232 + t.Errorf("Expected empty string for nil document, got %q", siteName) 233 + } 234 + }) 235 + }) 236 + 237 + t.Run("ExtractLanguage", func(t *testing.T) { 238 + extractor := NewMetadataExtractor() 239 + 240 + t.Run("extracts from html lang attribute", func(t *testing.T) { 241 + htmlStr := `<html lang="en-US"><body></body></html>` 242 + doc := 
parseHTML(htmlStr) 243 + 244 + lang := extractor.ExtractLanguage(doc) 245 + 246 + if lang != "en-US" { 247 + t.Errorf("Expected html lang attribute, got %q", lang) 248 + } 249 + }) 250 + 251 + t.Run("extracts from OpenGraph locale", func(t *testing.T) { 252 + htmlStr := `<html><head> 253 + <meta property="og:locale" content="fr-FR"> 254 + </head><body></body></html>` 255 + doc := parseHTML(htmlStr) 256 + 257 + lang := extractor.ExtractLanguage(doc) 258 + 259 + if lang != "fr-FR" { 260 + t.Errorf("Expected OpenGraph locale, got %q", lang) 261 + } 262 + }) 263 + 264 + t.Run("returns empty for nil document", func(t *testing.T) { 265 + lang := extractor.ExtractLanguage(nil) 266 + 267 + if lang != "" { 268 + t.Errorf("Expected empty string for nil document, got %q", lang) 269 + } 270 + }) 271 + }) 272 + 273 + t.Run("getSchemaOrgField", func(t *testing.T) { 274 + extractor := NewMetadataExtractor() 275 + 276 + t.Run("extracts from JSON-LD Article", func(t *testing.T) { 277 + htmlStr := `<html><head> 278 + <script type="application/ld+json"> 279 + { 280 + "@context": "https://schema.org", 281 + "@type": "Article", 282 + "headline": "Test Article", 283 + "author": "Test Author", 284 + "datePublished": "2025-01-15" 285 + } 286 + </script> 287 + </head><body></body></html>` 288 + doc := parseHTML(htmlStr) 289 + 290 + headline := extractor.getSchemaOrgField(doc, "headline") 291 + author := extractor.getSchemaOrgField(doc, "author") 292 + date := extractor.getSchemaOrgField(doc, "datePublished") 293 + 294 + if headline != "Test Article" { 295 + t.Errorf("Expected headline from JSON-LD, got %q", headline) 296 + } 297 + 298 + if author != "Test Author" { 299 + t.Errorf("Expected author from JSON-LD, got %q", author) 300 + } 301 + 302 + if date != "2025-01-15" { 303 + t.Errorf("Expected datePublished from JSON-LD, got %q", date) 304 + } 305 + }) 306 + 307 + t.Run("extracts from NewsArticle type", func(t *testing.T) { 308 + htmlStr := `<html><head> 309 + <script type="application/ld+json"> 310 + { 311 + "@context": "https://schema.org", 312 + "@type": "NewsArticle", 313 + "headline": "Breaking News" 314 + } 315 + </script> 316 + </head><body></body></html>` 317 + doc := parseHTML(htmlStr) 318 + 319 + headline := extractor.getSchemaOrgField(doc, "headline") 320 + 321 + if headline != "Breaking News" { 322 + t.Errorf("Expected headline from NewsArticle, got %q", headline) 323 + } 324 + }) 325 + 326 + t.Run("handles nested author object", func(t *testing.T) { 327 + htmlStr := `<html><head> 328 + <script type="application/ld+json"> 329 + { 330 + "@context": "https://schema.org", 331 + "@type": "Article", 332 + "author": { 333 + "@type": "Person", 334 + "name": "Nested Author" 335 + } 336 + } 337 + </script> 338 + </head><body></body></html>` 339 + doc := parseHTML(htmlStr) 340 + 341 + author := extractor.getSchemaOrgField(doc, "author") 342 + 343 + if author != "Nested Author" { 344 + t.Errorf("Expected nested author name, got %q", author) 345 + } 346 + }) 347 + 348 + t.Run("returns empty for invalid JSON", func(t *testing.T) { 349 + htmlStr := `<html><head> 350 + <script type="application/ld+json"> 351 + { invalid json } 352 + </script> 353 + </head><body></body></html>` 354 + doc := parseHTML(htmlStr) 355 + 356 + result := extractor.getSchemaOrgField(doc, "headline") 357 + 358 + if result != "" { 359 + t.Errorf("Expected empty for invalid JSON, got %q", result) 360 + } 361 + }) 362 + 363 + t.Run("returns empty for non-Article types", func(t *testing.T) { 364 + htmlStr := `<html><head> 365 + <script 
type="application/ld+json"> 366 + { 367 + "@context": "https://schema.org", 368 + "@type": "WebPage", 369 + "headline": "Not an article" 370 + } 371 + </script> 372 + </head><body></body></html>` 373 + doc := parseHTML(htmlStr) 374 + 375 + result := extractor.getSchemaOrgField(doc, "headline") 376 + 377 + if result != "" { 378 + t.Errorf("Expected empty for WebPage type, got %q", result) 379 + } 380 + }) 381 + }) 382 + 383 + t.Run("ExtractMetadata", func(t *testing.T) { 384 + extractor := NewMetadataExtractor() 385 + 386 + t.Run("extracts all metadata fields", func(t *testing.T) { 387 + htmlStr := `<html lang="en"><head> 388 + <title>Full Article Title</title> 389 + <meta property="og:author" content="Full Name"> 390 + <meta property="article:published_time" content="2025-01-20"> 391 + <meta property="og:site_name" content="News Site"> 392 + </head><body></body></html>` 393 + doc := parseHTML(htmlStr) 394 + 395 + result := extractor.ExtractMetadata(doc) 396 + 397 + if result == nil { 398 + t.Fatal("Expected result, got nil") 399 + } 400 + 401 + if !strings.Contains(result.Title, "Full Article Title") { 402 + t.Errorf("Expected title to be extracted, got %q", result.Title) 403 + } 404 + 405 + if result.Author != "Full Name" { 406 + t.Errorf("Expected author to be extracted, got %q", result.Author) 407 + } 408 + 409 + if result.PublishedDate != "2025-01-20" { 410 + t.Errorf("Expected date to be extracted, got %q", result.PublishedDate) 411 + } 412 + 413 + if result.SiteName != "News Site" { 414 + t.Errorf("Expected site name to be extracted, got %q", result.SiteName) 415 + } 416 + 417 + if result.Language != "en" { 418 + t.Errorf("Expected language to be extracted, got %q", result.Language) 419 + } 420 + }) 421 + 422 + t.Run("returns empty result for nil document", func(t *testing.T) { 423 + result := extractor.ExtractMetadata(nil) 424 + 425 + if result == nil { 426 + t.Error("Expected empty result, got nil") 427 + } 428 + }) 429 + }) 430 + }
+100 -20
internal/articles/parser.go
```diff
···
 
 // ParsedContent represents the extracted content from a web page
 type ParsedContent struct {
-	Title   string
-	Author  string
-	Date    string
-	Content string
-	URL     string
+	Title            string
+	Author           string
+	Date             string
+	Content          string
+	URL              string
+	Confidence       float64 // 0-1 scale, confidence in extraction quality
+	ExtractionMethod string  // "xpath", "heuristic", "dual-validated", etc.
 }
 
 // ParsingRule represents XPath rules for extracting content from a specific domain
···
 
 // ArticleParser implements the Parser interface
 type ArticleParser struct {
-	rules  map[string]*ParsingRule
-	client *http.Client
+	rules             map[string]*ParsingRule
+	client            *http.Client
+	heuristicExtract  *HeuristicExtractor
+	metadataExtractor *MetadataExtractor
 }
 
 // NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
 func NewArticleParser(client *http.Client) (*ArticleParser, error) {
 	parser := &ArticleParser{
-		rules:  make(map[string]*ParsingRule),
-		client: client,
+		rules:             make(map[string]*ParsingRule),
+		client:            client,
+		heuristicExtract:  NewHeuristicExtractor(),
+		metadataExtractor: NewMetadataExtractor(),
 	}
 
 	if err := parser.loadRules(); err != nil {
···
 
 // AddRule adds or replaces a parsing rule for a specific domain
 func (p *ArticleParser) AddRule(domain string, rule *ParsingRule) {
 	p.rules[domain] = rule
+}
+
+// SetHTTPClient overrides the HTTP client used for fetching article content.
+func (p *ArticleParser) SetHTTPClient(client *http.Client) {
+	p.client = client
 }
 
 func (p *ArticleParser) loadRules() error {
···
 	}
 
 	domain := parsedURL.Hostname()
-
 	rule := p.findRule(domain)
-
 	req, err := http.NewRequest(http.MethodGet, s, nil)
+
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
···
 	return p.Parse(string(htmlBytes), domain, s)
 }
 
-// ParseHTML extracts article content from HTML string using domain-specific rules
+// Parse extracts article content from an HTML string using domain-specific rules with heuristic fallback.
+// Implements dual validation: compares XPath results with heuristic extraction when rules exist.
 func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error) {
+	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse HTML: %w", err)
+	}
+
 	rule := p.findRule(domain)
 
 	if rule == nil {
-		return nil, fmt.Errorf("no parsing rule found for domain: %s", domain)
+		return p.parseWithHeuristics(doc, sourceURL)
 	}
 
-	doc, err := htmlquery.Parse(strings.NewReader(htmlContent))
-	if err != nil {
-		return nil, fmt.Errorf("failed to parse HTML: %w", err)
+	content := &ParsedContent{
+		URL:              sourceURL,
+		ExtractionMethod: "xpath",
+		Confidence:       0.85,
 	}
 
-	content := &ParsedContent{URL: sourceURL}
-
 	if rule.Title != "" {
 		if titleNode := htmlquery.FindOne(doc, rule.Title); titleNode != nil {
 			content.Title = strings.TrimSpace(htmlquery.InnerText(titleNode))
 		}
 	}
+	if content.Title == "" {
+		content.Title = p.metadataExtractor.ExtractTitle(doc)
+	}
 
 	if rule.Author != "" {
 		if authorNode := htmlquery.FindOne(doc, rule.Author); authorNode != nil {
 			content.Author = strings.TrimSpace(htmlquery.InnerText(authorNode))
 		}
+	}
+	if content.Author == "" {
+		content.Author = p.metadataExtractor.ExtractAuthor(doc)
 	}
 
 	if rule.Date != "" {
 		if dateNode := htmlquery.FindOne(doc, rule.Date); dateNode != nil {
 			content.Date = strings.TrimSpace(htmlquery.InnerText(dateNode))
 		}
+	}
+	if content.Date == "" {
+		content.Date = p.metadataExtractor.ExtractPublishedDate(doc)
 	}
 
 	if rule.Body != "" {
 		bodyNode := htmlquery.FindOne(doc, rule.Body)
 		if bodyNode == nil {
-			return nil, fmt.Errorf("could not extract body content from HTML")
+			return p.parseWithHeuristics(doc, sourceURL)
 		}
 
 		for _, stripXPath := range rule.Strip {
···
 
 		removeDefaultNonContentNodes(bodyNode)
 
-		content.Content = normalizeWhitespace(htmlquery.InnerText(bodyNode))
+		xpathContent := normalizeWhitespace(htmlquery.InnerText(bodyNode))
+
+		heuristicResult := p.heuristicExtract.CompareWithXPath(doc, bodyNode)
+		if heuristicResult != nil {
+			content.Content = heuristicResult.Content
+			if content.Content == "" {
+				content.Content = xpathContent
+			}
+			content.Confidence = heuristicResult.Confidence
+			content.ExtractionMethod = heuristicResult.ExtractionMethod
+		} else {
+			content.Content = xpathContent
+		}
 	}
 
 	if content.Title == "" {
 		return nil, fmt.Errorf("could not extract title from HTML")
+	}
+
+	return content, nil
+}
+
+// parseWithHeuristics performs heuristic-only extraction when no XPath rule exists.
+func (p *ArticleParser) parseWithHeuristics(doc *exhtml.Node, sourceURL string) (*ParsedContent, error) {
+	result := p.heuristicExtract.ExtractWithSemanticHTML(doc)
+	if result == nil {
+		result = &ExtractionResult{
+			ExtractionMethod: "heuristic-failed",
+			Confidence:       0.0,
+		}
+	}
+
+	metadata := p.metadataExtractor.ExtractMetadata(doc)
+	if metadata != nil {
+		if result.Title == "" {
+			result.Title = metadata.Title
+		}
+		if result.Author == "" {
+			result.Author = metadata.Author
+		}
+		if result.PublishedDate == "" {
+			result.PublishedDate = metadata.PublishedDate
+		}
+	}
+
+	content := &ParsedContent{
+		Title:            result.Title,
+		Author:           result.Author,
+		Date:             result.PublishedDate,
+		Content:          result.Content,
+		URL:              sourceURL,
+		Confidence:       result.Confidence,
+		ExtractionMethod: result.ExtractionMethod,
+	}
+
+	if content.Title == "" {
+		return nil, fmt.Errorf("could not extract title from HTML using heuristics")
+	}
+
+	if content.Confidence < 0.3 {
+		return nil, fmt.Errorf("heuristic extraction confidence too low (%.2f)", content.Confidence)
 	}
 
 	return content, nil
```
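To illustrate the new control flow, a hedged sketch of the no-rule path (hypothetical snippet in the articles package; the sample domain and HTML are made up, and Parse/ParsedContent are from this diff):

```go
package articles

import (
	"fmt"
	"net/http"
)

// demoHeuristicFallback is a hypothetical illustration, not part of the
// commit: a domain with no registered rule now routes through
// parseWithHeuristics instead of erroring with "no parsing rule found".
func demoHeuristicFallback() {
	pageHTML := `<html><head><title>Fallback Demo</title></head><body><article>
	<p>Enough body text here for the heuristics to accept this as an article.</p>
	<p>A second paragraph helps keep extraction confidence above the 0.3 floor.</p>
	</article></body></html>`

	parser, err := NewArticleParser(http.DefaultClient)
	if err != nil {
		return
	}

	// No rule matches "example.org"; very short documents can still fail
	// the confidence check in parseWithHeuristics.
	content, err := parser.Parse(pageHTML, "example.org", "https://example.org/post")
	if err != nil {
		return
	}

	fmt.Printf("%s via %s (%.2f)\n", content.Title, content.ExtractionMethod, content.Confidence)
}
```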
+274 -106
internal/articles/parser_test.go
```diff
 package articles
 
 import (
+	"errors"
 	"fmt"
+	"io"
 	"net/http"
-	"net/http/httptest"
 	"os"
 	"strings"
 	"testing"
···
 
 	"github.com/stormlightlabs/noteleaf/internal/models"
 )
-
-func newServerWithHtml(h string) *httptest.Server {
-	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.WriteHeader(http.StatusOK)
-		w.Write([]byte(h))
-	}))
-}
 
 // ExampleParser_Convert demonstrates parsing a local HTML file using Wikipedia rules.
 func ExampleParser_Convert() {
···
 
 	// Output: # Christopher Lloyd
 	//
-	// **Source:** https://en.wikipedia.org/wiki/Christopher_Lloyd
+	// **Author:** Contributors to Wikimedia projects
 }
 
 func TestArticleParser(t *testing.T) {
···
 		if err == nil {
 			t.Error("Expected error for unsupported domain")
 		}
-		if !strings.Contains(err.Error(), "no parsing rule found") {
-			t.Errorf("Expected 'no parsing rule found' error, got %v", err)
+
+		if !strings.Contains(err.Error(), "confidence too low") &&
+			!strings.Contains(err.Error(), "could not extract title") {
+			t.Errorf("Expected heuristic extraction error, got %v", err)
 		}
 	})
···
 		if err == nil {
 			t.Error("Expected error when no title can be extracted")
 		}
+
 		if !strings.Contains(err.Error(), "could not extract title") &&
-			!strings.Contains(err.Error(), "could not extract body content") {
-			t.Errorf("Expected title or body extraction error, got %v", err)
+			!strings.Contains(err.Error(), "could not extract body content") &&
+			!strings.Contains(err.Error(), "confidence too low") {
+			t.Errorf("Expected title, body, or confidence error, got %v", err)
 		}
 	})
···
 			t.Error("Expected footer content to be stripped")
 		}
 	})
+
+	t.Run("uses heuristic extraction for unsupported domain with semantic HTML", func(t *testing.T) {
+		htmlContent := `<html><head>
+		<title>Heuristic Test Article</title>
+		<meta property="og:author" content="Heuristic Author">
+		<meta property="article:published_time" content="2025-01-15">
+		</head><body>
+		<article>
+			<p>This is a substantial article that should be extracted using heuristic methods.</p>
+			<p>It contains multiple paragraphs with sufficient content for the readability algorithm.</p>
+			<p>The heuristic extractor should successfully identify this as main content.</p>
+		</article>
+		</body></html>`
+
+		markdown, err := parser.Convert(htmlContent, "unsupported-domain.com", "https://unsupported-domain.com/article")
+
+		if err == nil {
+			if !strings.Contains(markdown, "substantial article") {
+				t.Error("Expected markdown to contain extracted content")
+			}
+		}
+	})
+
+	t.Run("includes confidence score in parsed content", func(t *testing.T) {
+		htmlContent := `<html>
+		<head><title>Confidence Test</title></head>
+		<body>
+			<h1 id="firstHeading">Confidence Test Article</h1>
+			<div id="bodyContent">
+				<p>Article content for confidence testing.</p>
+			</div>
+		</body>
+		</html>`
+
+		content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Confidence")
+		if err != nil {
+			t.Fatalf("Expected no error, got %v", err)
+		}
+
+		if content.Confidence == 0.0 {
+			t.Error("Expected non-zero confidence score")
+		}
+
+		if content.ExtractionMethod == "" {
+			t.Error("Expected extraction method to be set")
+		}
+	})
+
+	t.Run("falls back to metadata extractor when XPath fails", func(t *testing.T) {
+		htmlContent := `<html><head>
+		<title>Metadata Fallback Test</title>
+		<meta property="og:author" content="Metadata Author">
+		<meta property="article:published_time" content="2025-01-20">
+		</head><body>
+		<h1 id="firstHeading">Fallback Test</h1>
+		<div id="bodyContent">
+			<p>Content without author or date in XPath locations.</p>
+		</div>
+		</body></html>`
+
+		content, err := parser.Parse(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Metadata_Test")
+		if err != nil {
+			t.Fatalf("Expected no error, got %v", err)
+		}
+
+		if content.Author != "Metadata Author" {
+			t.Errorf("Expected metadata fallback for author, got %q", content.Author)
+		}
+
+		if content.Date != "2025-01-20" {
+			t.Errorf("Expected metadata fallback for date, got %q", content.Date)
+		}
+	})
 })
 
 t.Run("ParseURL", func(t *testing.T) {
-	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		switch {
-		case strings.Contains(r.URL.Path, "404"):
-			w.WriteHeader(http.StatusNotFound)
-		case strings.Contains(r.URL.Path, "unsupported"):
-			w.WriteHeader(http.StatusOK)
-			w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
-		default:
-			w.WriteHeader(http.StatusOK)
-			w.Write([]byte(`<html>
+	parser, err := NewArticleParser(http.DefaultClient)
+	if err != nil {
+		t.Fatalf("Failed to create parser: %v", err)
+	}
+
+	localhostRule := &ParsingRule{
+		Domain: "example.com",
+		Title:  "//h1[@id='firstHeading']",
+		Body:   "//div[@id='bodyContent']",
+		Strip:  []string{"//div[@class='noprint']"},
+	}
+	parser.AddRule("example.com", localhostRule)
+
+	const (
+		validURL       = "https://example.com/wiki/test"
+		httpErrorURL   = "https://example.com/wiki/404"
+		unsupportedURL = "https://unsupported-domain.test/article"
+	)
+
+	parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) {
+		switch req.URL.String() {
+		case validURL:
+			return htmlResponse(http.StatusOK, `<html>
 	<head><title>Test Article</title></head>
 	<body>
 		<h1 id="firstHeading">Test Wikipedia Article</h1>
···
 			<div class="noprint">This gets stripped</div>
 		</div>
 	</body>
-	</html>`))
+	</html>`), nil
+		case httpErrorURL:
+			return &http.Response{
+				StatusCode: http.StatusNotFound,
+				Header:     make(http.Header),
+				Body:       io.NopCloser(strings.NewReader("")),
+			}, nil
+		case unsupportedURL:
+			return htmlResponse(http.StatusOK, `<html><head><title>Unsupported</title></head><body><p>Content</p></body></html>`), nil
+		default:
+			return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
 		}
 	}))
-	defer server.Close()
-
-	parser, err := NewArticleParser(server.Client())
-	if err != nil {
-		t.Fatalf("Failed to create parser: %v", err)
-	}
-
-	localhostRule := &ParsingRule{
-		Domain: "127.0.0.1",
-		Title:  "//h1[@id='firstHeading']",
-		Body:   "//div[@id='bodyContent']",
-		Strip:  []string{"//div[@class='noprint']"},
-	}
-	parser.AddRule("127.0.0.1", localhostRule)
 
 	t.Run("fails with invalid URL", func(t *testing.T) {
 		_, err := parser.ParseURL("not-a-url")
 		if err == nil {
 			t.Error("Expected error for invalid URL")
 		}
-		if !strings.Contains(err.Error(), "unsupported protocol scheme") {
-			t.Errorf("Expected 'unsupported protocol scheme' error, got %v", err)
+		if !strings.Contains(err.Error(), "unsupported protocol scheme") &&
+			!strings.Contains(err.Error(), "failed to fetch URL") &&
+			!strings.Contains(err.Error(), "invalid URL") {
+			t.Errorf("Expected URL scheme error, got %v", err)
 		}
 	})
 
 	t.Run("fails with unsupported domain", func(t *testing.T) {
-		_, err := parser.ParseURL(server.URL + "/unsupported.com")
+		_, err := parser.ParseURL(unsupportedURL)
 		if err == nil {
 			t.Error("Expected error for unsupported domain")
 		}
 	})
 
 	t.Run("fails with HTTP error", func(t *testing.T) {
-		_, err := parser.ParseURL(server.URL + "/404/en.wikipedia.org/wiki/test")
+		_, err := parser.ParseURL(httpErrorURL)
 		if err == nil {
 			t.Error("Expected error for HTTP 404")
 		}
 	})
+
+	t.Run("successfully parses supported domain", func(t *testing.T) {
+		content, err := parser.ParseURL(validURL)
+		if err != nil {
+			t.Fatalf("Expected no error, got %v", err)
+		}
+		if content == nil {
+			t.Fatal("Expected parsed content, got nil")
+		}
+		if content.Title != "Test Wikipedia Article" {
+			t.Errorf("Expected title to be extracted, got %q", content.Title)
+		}
+		if !strings.Contains(content.Content, "This is the article content.") {
+			t.Errorf("Expected content to include article text, got %q", content.Content)
+		}
+		if strings.Contains(content.Content, "This gets stripped") {
+			t.Error("Expected strip rules to remove non-content nodes")
+		}
+	})
···
 })
 
 t.Run("fails with unsupported domain", func(t *testing.T) {
-	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.WriteHeader(http.StatusOK)
-		w.Write([]byte("<html><head><title>Test</title></head><body><p>Content</p></body></html>"))
-	}))
-	defer server.Close()
+	unsupportedURL := "https://unsupported-domain.test/article"
+	withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) {
+		if req.URL.String() == unsupportedURL {
+			return htmlResponse(http.StatusOK, "<html><body><div>Too little content</div></body></html>"), nil
+		}
+		return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
+	})
 
-	_, err := CreateArticleFromURL(server.URL, tempDir)
+	_, err := CreateArticleFromURL(unsupportedURL, tempDir)
 	if err == nil {
 		t.Error("Expected error for unsupported domain")
 	}
-	if !strings.Contains(err.Error(), "no parsing rule found") {
-		t.Errorf("Expected 'no parsing rule found' error, got %v", err)
+	if !strings.Contains(err.Error(), "confidence too low") &&
+		!strings.Contains(err.Error(), "could not extract title") {
+		t.Errorf("Expected heuristic extraction error, got %v", err)
 	}
 })
 
 t.Run("fails with HTTP error", func(t *testing.T) {
-	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.WriteHeader(http.StatusNotFound)
-	}))
-	defer
```
server.Close() 717 + errorURL := "https://example.com/missing" 718 + withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) { 719 + if req.URL.String() == errorURL { 720 + return &http.Response{ 721 + StatusCode: http.StatusNotFound, 722 + Header: make(http.Header), 723 + Body: io.NopCloser(strings.NewReader("")), 724 + }, nil 725 + } 726 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 727 + }) 617 728 618 - _, err := CreateArticleFromURL("https://en.wikipedia.org/wiki/NonExistentPage12345", tempDir) 729 + _, err := CreateArticleFromURL(errorURL, tempDir) 619 730 if err == nil { 620 731 t.Error("Expected error for HTTP 404") 621 732 } ··· 625 736 }) 626 737 627 738 t.Run("fails with network error", func(t *testing.T) { 628 - _, err := CreateArticleFromURL("http://localhost:99999/test", tempDir) 739 + networkURL := "https://example.com/network" 740 + withDefaultHTTPClient(t, func(req *http.Request) (*http.Response, error) { 741 + if req.URL.String() == networkURL { 742 + return nil, errors.New("dial error") 743 + } 744 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 745 + }) 746 + 747 + _, err := CreateArticleFromURL(networkURL, tempDir) 629 748 if err == nil { 630 749 t.Error("Expected error for network failure") 631 750 } ··· 635 754 }) 636 755 637 756 t.Run("fails with malformed HTML", func(t *testing.T) { 638 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 639 - w.WriteHeader(http.StatusOK) 640 - w.Write([]byte("<html><head><title>Test</head></body>")) 641 - })) 642 - defer server.Close() 643 - 644 - parser, err := NewArticleParser(server.Client()) 757 + parser, err := NewArticleParser(http.DefaultClient) 645 758 if err != nil { 646 759 t.Fatalf("Failed to create parser: %v", err) 647 760 } 648 761 649 762 localhostRule := &ParsingRule{ 650 - Domain: "127.0.0.1", 763 + Domain: "example.com", 651 764 Title: "//h1[@id='firstHeading']", 652 765 Body: "//div[@id='bodyContent']", 653 766 Strip: []string{"//div[@class='noprint']"}, 654 767 } 655 - parser.AddRule("127.0.0.1", localhostRule) 768 + parser.AddRule("example.com", localhostRule) 656 769 657 - _, err = parser.ParseURL(server.URL) 770 + malformedURL := "https://example.com/malformed" 771 + parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 772 + if req.URL.String() == malformedURL { 773 + return htmlResponse(http.StatusOK, "<html><head><title>Test</head></body>"), nil 774 + } 775 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 776 + })) 777 + 778 + _, err = parser.ParseURL(malformedURL) 658 779 if err == nil { 659 780 t.Error("Expected error for malformed HTML") 660 781 } 782 + 661 783 if !strings.Contains(err.Error(), "failed to parse HTML") && 662 784 !strings.Contains(err.Error(), "could not extract title") && 663 - !strings.Contains(err.Error(), "could not extract body content") { 785 + !strings.Contains(err.Error(), "could not extract body content") && 786 + !strings.Contains(err.Error(), "confidence too low") { 664 787 t.Errorf("Expected HTML parsing or extraction error, got %v", err) 665 788 } 666 789 }) 667 790 668 791 t.Run("fails when no title can be extracted", func(t *testing.T) { 669 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 670 - w.WriteHeader(http.StatusOK) 671 - w.Write([]byte(`<html> 672 - <head><title>Test</title></head> 673 - <body> 674 - <div id="bodyContent"> 675 - <p>Content without proper 
title</p> 676 - </div> 677 - </body> 678 - </html>`)) 679 - })) 680 - defer server.Close() 681 - 682 - parser, err := NewArticleParser(server.Client()) 792 + parser, err := NewArticleParser(http.DefaultClient) 683 793 if err != nil { 684 794 t.Fatalf("Failed to create parser: %v", err) 685 795 } 686 796 687 797 localhostRule := &ParsingRule{ 688 - Domain: "127.0.0.1", 798 + Domain: "example.com", 689 799 Title: "//h1[@id='firstHeading']", 690 800 Body: "//div[@id='bodyContent']", 691 801 Strip: []string{"//div[@class='noprint']"}, 692 802 } 693 - parser.AddRule("127.0.0.1", localhostRule) 803 + parser.AddRule("example.com", localhostRule) 694 804 695 - _, err = parser.ParseURL(server.URL) 696 - if err == nil { 697 - t.Error("Expected error when no title can be extracted") 698 - } 699 - if !strings.Contains(err.Error(), "could not extract title") { 700 - t.Errorf("Expected 'could not extract title' error, got %v", err) 805 + noTitleURL := "https://example.com/notitle" 806 + parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 807 + if req.URL.String() == noTitleURL { 808 + return htmlResponse(http.StatusOK, `<html> 809 + <head><title>Test</title></head> 810 + <body> 811 + <div id="bodyContent"> 812 + <p>Content without proper title</p> 813 + </div> 814 + </body> 815 + </html>`), nil 816 + } 817 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 818 + })) 819 + 820 + result, err := parser.ParseURL(noTitleURL) 821 + 822 + if err != nil { 823 + if !strings.Contains(err.Error(), "could not extract title") && 824 + !strings.Contains(err.Error(), "confidence too low") { 825 + t.Errorf("Expected title extraction error, got %v", err) 826 + } 827 + } else if result != nil { 828 + if result.Title == "" { 829 + t.Error("Expected title to be extracted via metadata fallback") 830 + } 701 831 } 702 832 }) 703 833 ··· 714 844 </body> 715 845 </html>` 716 846 717 - server := newServerWithHtml(wikipediaHTML) 718 - defer server.Close() 719 - 720 - parser, err := NewArticleParser(server.Client()) 847 + parser, err := NewArticleParser(http.DefaultClient) 721 848 if err != nil { 722 849 t.Fatalf("Failed to create parser: %v", err) 723 850 } 724 851 725 852 localhostRule := &ParsingRule{ 726 - Domain: "127.0.0.1", 853 + Domain: "example.com", 727 854 Title: "//h1[@id='firstHeading']", 728 855 Body: "//div[@id='bodyContent']", 729 856 Strip: []string{"//div[@class='noprint']"}, 730 857 } 731 - parser.AddRule("127.0.0.1", localhostRule) 858 + parser.AddRule("example.com", localhostRule) 732 859 733 - content, err := parser.ParseURL(server.URL) 860 + contentURL := "https://example.com/integration" 861 + parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 862 + if req.URL.String() == contentURL { 863 + return htmlResponse(http.StatusOK, wikipediaHTML), nil 864 + } 865 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 866 + })) 867 + 868 + content, err := parser.ParseURL(contentURL) 734 869 if err != nil { 735 870 t.Fatalf("Expected no error, got %v", err) 736 871 } ··· 741 876 } 742 877 743 878 article := &models.Article{ 744 - URL: server.URL, 879 + URL: contentURL, 745 880 Title: content.Title, 746 881 MarkdownPath: mdPath, 747 882 HTMLPath: htmlPath, ··· 752 887 if article.Title != "Integration Test Article" { 753 888 t.Errorf("Expected title 'Integration Test Article', got %s", article.Title) 754 889 } 755 - if article.URL != server.URL { 756 - t.Errorf("Expected URL %s, got %s", server.URL, 
article.URL) 890 + if article.URL != contentURL { 891 + t.Errorf("Expected URL %s, got %s", contentURL, article.URL) 757 892 } 758 893 if article.MarkdownPath == "" { 759 894 t.Error("Expected non-empty markdown path") ··· 817 952 </body> 818 953 </html>` 819 954 820 - server := newServerWithHtml(contentHTML) 821 - defer server.Close() 822 - 823 - parser, err := NewArticleParser(server.Client()) 955 + parser, err := NewArticleParser(http.DefaultClient) 824 956 if err != nil { 825 957 t.Fatalf("Failed to create parser: %v", err) 826 958 } 827 959 828 960 localhostRule := &ParsingRule{ 829 - Domain: "127.0.0.1", 961 + Domain: "example.com", 830 962 Title: "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]", 831 963 Body: "//blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]", 832 964 Date: "//meta[@name='citation_date']/@content", 833 965 Author: "//meta[@name='citation_author']/@content", 834 966 } 835 - parser.AddRule("127.0.0.1", localhostRule) 967 + parser.AddRule("example.com", localhostRule) 968 + 969 + contentURL := "https://example.com/metadata" 970 + parser.SetHTTPClient(newMockHTTPClient(t, func(req *http.Request) (*http.Response, error) { 971 + if req.URL.String() == contentURL { 972 + return htmlResponse(http.StatusOK, contentHTML), nil 973 + } 974 + return nil, fmt.Errorf("unexpected request: %s", req.URL.String()) 975 + })) 836 976 837 - content, err := parser.ParseURL(server.URL) 977 + content, err := parser.ParseURL(contentURL) 838 978 if err != nil { 839 979 t.Fatalf("Expected no error, got %v", err) 840 980 } ··· 878 1018 } 879 1019 }) 880 1020 } 1021 + 1022 + type roundTripFunc func(*http.Request) (*http.Response, error) 1023 + 1024 + func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { 1025 + return f(req) 1026 + } 1027 + 1028 + func newMockHTTPClient(t *testing.T, fn roundTripFunc) *http.Client { 1029 + t.Helper() 1030 + return &http.Client{Transport: fn} 1031 + } 1032 + 1033 + func htmlResponse(status int, body string) *http.Response { 1034 + return &http.Response{ 1035 + StatusCode: status, 1036 + Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 1037 + Body: io.NopCloser(strings.NewReader(body)), 1038 + } 1039 + } 1040 + 1041 + func withDefaultHTTPClient(t *testing.T, fn roundTripFunc) { 1042 + t.Helper() 1043 + original := http.DefaultClient.Transport 1044 + http.DefaultClient.Transport = fn 1045 + t.Cleanup(func() { 1046 + http.DefaultClient.Transport = original 1047 + }) 1048 + }
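The tests above swap `httptest` servers for an in-process `http.RoundTripper`, so no sockets are opened and each request is matched by its full URL. A minimal, self-contained sketch of that pattern follows; the handler body and the example URL are illustrative, not taken from the diff:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

// roundTripFunc adapts a plain function to http.RoundTripper,
// mirroring the helper added in parser_test.go.
type roundTripFunc func(*http.Request) (*http.Response, error)

func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { return f(req) }

func main() {
	// A client whose transport never dials out: every request is
	// answered in-process, keyed on the request URL.
	client := &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
		if req.URL.String() != "https://example.com/article" { // hypothetical URL
			return nil, fmt.Errorf("unexpected request: %s", req.URL)
		}
		return &http.Response{
			StatusCode: http.StatusOK,
			Header:     http.Header{"Content-Type": []string{"text/html; charset=utf-8"}},
			Body:       io.NopCloser(strings.NewReader("<html><body><p>stub</p></body></html>")),
		}, nil
	})}

	resp, err := client.Get("https://example.com/article")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body))
}
```

Compared to `httptest.NewServer`, this keeps the tests hermetic (no listener, no port allocation) and lets the fake see the original request URL, which is what allows `parser.ParseURL` to be exercised against stable `https://example.com/...` addresses instead of a server-assigned `127.0.0.1` port.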
+389
internal/articles/scorer.go
··· 1 + package articles 2 + 3 + import ( 4 + "math" 5 + "regexp" 6 + "strings" 7 + 8 + "golang.org/x/net/html" 9 + ) 10 + 11 + // ContentScore represents the score and metadata for a content node. 12 + type ContentScore struct { 13 + Node *html.Node 14 + Score float64 15 + TextLength int 16 + LinkDensity float64 17 + ParagraphCount int 18 + AncestorDepth int 19 + ConfidenceLevel float64 20 + } 21 + 22 + // Scorer implements Readability-style heuristic scoring for content extraction. 23 + type Scorer struct { 24 + linkDensityWeight float64 25 + classWeightPositive float64 26 + classWeightNegative float64 27 + paragraphWeight float64 28 + ancestorDecayFactor float64 29 + minContentLength int 30 + minScore float64 31 + positivePattern *regexp.Regexp 32 + negativePattern *regexp.Regexp 33 + unlikelyPattern *regexp.Regexp 34 + } 35 + 36 + // NewScorer creates a new Scorer with default Readability.js-inspired weights. 37 + func NewScorer() *Scorer { 38 + return &Scorer{ 39 + linkDensityWeight: -1.0, 40 + classWeightPositive: 25.0, 41 + classWeightNegative: -25.0, 42 + paragraphWeight: 1.0, 43 + ancestorDecayFactor: 0.5, 44 + minContentLength: 140, 45 + minScore: 20.0, 46 + 47 + positivePattern: regexp.MustCompile(`(?i)(article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story)`), 48 + negativePattern: regexp.MustCompile(`(?i)(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|ad-|advertisement|breadcrumb|hidden|nav|menu|header)`), 49 + unlikelyPattern: regexp.MustCompile(`(?i)(banner|cookie|popup|modal)`), 50 + } 51 + } 52 + 53 + // ScoreNode calculates a content score for the given node based on multiple heuristics. 54 + // This implements the core Readability scoring algorithm. 55 + func (s *Scorer) ScoreNode(node *html.Node) *ContentScore { 56 + if node == nil || node.Type != html.ElementNode { 57 + return nil 58 + } 59 + 60 + score := &ContentScore{ 61 + Node: node, 62 + Score: 0.0, 63 + AncestorDepth: s.calculateDepth(node), 64 + } 65 + 66 + score.Score = s.getTagScore(node.Data) 67 + score.Score += s.getClassIdScore(node) 68 + 69 + score.TextLength = s.calculateTextLength(node) 70 + score.LinkDensity = s.calculateLinkDensity(node) 71 + score.ParagraphCount = s.countParagraphs(node) 72 + 73 + score.Score += score.LinkDensity * s.linkDensityWeight 74 + score.Score += float64(score.ParagraphCount) * s.paragraphWeight 75 + score.Score += s.getTextLengthScore(score.TextLength) 76 + 77 + score.ConfidenceLevel = s.calculateConfidence(score) 78 + return score 79 + } 80 + 81 + // getTagScore returns a base score based on the HTML tag type. 82 + // Some tags are more likely to contain main content than others. 83 + func (s *Scorer) getTagScore(tagName string) float64 { 84 + switch strings.ToLower(tagName) { 85 + case "article": 86 + return 30.0 87 + case "section": 88 + return 15.0 89 + case "div": 90 + return 5.0 91 + case "main": 92 + return 40.0 93 + case "p": 94 + return 3.0 95 + case "pre", "td", "blockquote": 96 + return 3.0 97 + case "address", "ol", "ul", "dl", "dd", "dt", "li", "form": 98 + return -3.0 99 + case "h1", "h2", "h3", "h4", "h5", "h6", "th": 100 + return -5.0 101 + default: 102 + return 0.0 103 + } 104 + } 105 + 106 + // getClassIdScore analyzes class and ID attributes for positive/negative indicators. 107 + // Returns a positive score for content-like names, negative for navigation/ads. 
108 + func (s *Scorer) getClassIdScore(node *html.Node) float64 { 109 + score := 0.0 110 + classID := s.getClassAndID(node) 111 + 112 + if classID == "" { 113 + return 0.0 114 + } 115 + 116 + if s.unlikelyPattern.MatchString(classID) { 117 + return -50.0 118 + } 119 + 120 + if s.negativePattern.MatchString(classID) { 121 + score += s.classWeightNegative 122 + } 123 + 124 + if s.positivePattern.MatchString(classID) { 125 + score += s.classWeightPositive 126 + } 127 + 128 + return score 129 + } 130 + 131 + // getClassAndID concatenates class and ID attributes for pattern matching. 132 + func (s *Scorer) getClassAndID(node *html.Node) string { 133 + var parts []string 134 + 135 + for _, attr := range node.Attr { 136 + if attr.Key == "class" || attr.Key == "id" { 137 + parts = append(parts, attr.Val) 138 + } 139 + } 140 + 141 + return strings.Join(parts, " ") 142 + } 143 + 144 + // calculateTextLength returns the total text length within the node. 145 + func (s *Scorer) calculateTextLength(node *html.Node) int { 146 + text := s.getInnerText(node) 147 + return len(strings.TrimSpace(text)) 148 + } 149 + 150 + // calculateLinkDensity calculates the ratio of link text to total text. 151 + // Higher link density indicates navigation or related links, not main content. 152 + func (s *Scorer) calculateLinkDensity(node *html.Node) float64 { 153 + totalText := s.getInnerText(node) 154 + linkText := s.getLinkText(node) 155 + 156 + totalLen := len(strings.TrimSpace(totalText)) 157 + linkLen := len(strings.TrimSpace(linkText)) 158 + 159 + if totalLen == 0 { 160 + return 0.0 161 + } 162 + 163 + return float64(linkLen) / float64(totalLen) 164 + } 165 + 166 + // getInnerText extracts all text content from a node and its descendants. 167 + func (s *Scorer) getInnerText(node *html.Node) string { 168 + var buf strings.Builder 169 + s.extractText(node, &buf) 170 + return buf.String() 171 + } 172 + 173 + // extractText recursively extracts text from a node tree. 174 + func (s *Scorer) extractText(node *html.Node, buf *strings.Builder) { 175 + if node == nil { 176 + return 177 + } 178 + 179 + if node.Type == html.TextNode { 180 + buf.WriteString(node.Data) 181 + buf.WriteString(" ") 182 + return 183 + } 184 + 185 + if node.Type == html.ElementNode { 186 + tag := strings.ToLower(node.Data) 187 + if tag == "script" || tag == "style" || tag == "noscript" { 188 + return 189 + } 190 + } 191 + 192 + for child := node.FirstChild; child != nil; child = child.NextSibling { 193 + s.extractText(child, buf) 194 + } 195 + } 196 + 197 + // getLinkText extracts text from anchor tags only. 198 + func (s *Scorer) getLinkText(node *html.Node) string { 199 + var buf strings.Builder 200 + s.extractLinkText(node, &buf) 201 + return buf.String() 202 + } 203 + 204 + // extractLinkText recursively extracts text from anchor tags. 205 + func (s *Scorer) extractLinkText(node *html.Node, buf *strings.Builder) { 206 + if node == nil { 207 + return 208 + } 209 + 210 + if node.Type == html.ElementNode && strings.ToLower(node.Data) == "a" { 211 + s.extractText(node, buf) 212 + return 213 + } 214 + 215 + for child := node.FirstChild; child != nil; child = child.NextSibling { 216 + s.extractLinkText(child, buf) 217 + } 218 + } 219 + 220 + // countParagraphs counts paragraph elements within the node. 221 + func (s *Scorer) countParagraphs(node *html.Node) int { 222 + count := 0 223 + s.walkParagraphs(node, &count) 224 + return count 225 + } 226 + 227 + // walkParagraphs recursively counts paragraph elements. 
228 + func (s *Scorer) walkParagraphs(node *html.Node, count *int) { 229 + if node == nil { 230 + return 231 + } 232 + 233 + if node.Type == html.ElementNode && strings.ToLower(node.Data) == "p" { 234 + *count++ 235 + } 236 + 237 + for child := node.FirstChild; child != nil; child = child.NextSibling { 238 + s.walkParagraphs(child, count) 239 + } 240 + } 241 + 242 + // getTextLengthScore provides a bonus for nodes with substantial text content. 243 + func (s *Scorer) getTextLengthScore(textLen int) float64 { 244 + if textLen < 25 { 245 + return 0.0 246 + } 247 + return math.Log10(float64(textLen)) * 2.0 248 + } 249 + 250 + // calculateDepth calculates how deep in the DOM tree this node is. 251 + func (s *Scorer) calculateDepth(node *html.Node) int { 252 + depth := 0 253 + for n := node.Parent; n != nil; n = n.Parent { 254 + depth++ 255 + } 256 + return depth 257 + } 258 + 259 + // ScoreAncestors propagates scores up the DOM tree with decay. 260 + // This implements the Readability algorithm's ancestor scoring. 261 + func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64) { 262 + if node == nil || baseScore <= 0 { 263 + return 264 + } 265 + 266 + currentScore := baseScore 267 + level := 0 268 + 269 + for parent := node.Parent; parent != nil && level < 5; parent = parent.Parent { 270 + if parent.Type != html.ElementNode { 271 + continue 272 + } 273 + 274 + if _, exists := scores[parent]; !exists { 275 + scores[parent] = s.ScoreNode(parent) 276 + if scores[parent] == nil { 277 + continue 278 + } 279 + } 280 + 281 + decayedScore := currentScore * math.Pow(s.ancestorDecayFactor, float64(level+1)) 282 + scores[parent].Score += decayedScore 283 + level++ 284 + } 285 + } 286 + 287 + // FindTopCandidates identifies the N highest-scoring content candidates. 288 + func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore { 289 + if root == nil || n <= 0 { 290 + return nil 291 + } 292 + 293 + scores := make(map[*html.Node]*ContentScore) 294 + s.scoreTree(root, scores) 295 + 296 + var candidates []*ContentScore 297 + for _, score := range scores { 298 + if score.Score >= s.minScore && score.TextLength >= s.minContentLength { 299 + candidates = append(candidates, score) 300 + } 301 + } 302 + 303 + for i := 0; i < len(candidates); i++ { 304 + for j := i + 1; j < len(candidates); j++ { 305 + if candidates[j].Score > candidates[i].Score { 306 + candidates[i], candidates[j] = candidates[j], candidates[i] 307 + } 308 + } 309 + } 310 + 311 + if len(candidates) > n { 312 + candidates = candidates[:n] 313 + } 314 + 315 + return candidates 316 + } 317 + 318 + // scoreTree recursively scores all nodes in the tree. 319 + func (s *Scorer) scoreTree(node *html.Node, scores map[*html.Node]*ContentScore) { 320 + if node == nil { 321 + return 322 + } 323 + 324 + if node.Type == html.ElementNode { 325 + tag := strings.ToLower(node.Data) 326 + if tag != "script" && tag != "style" && tag != "noscript" { 327 + score := s.ScoreNode(node) 328 + if score != nil && score.Score > 0 { 329 + scores[node] = score 330 + s.ScoreAncestors(scores, node, score.Score) 331 + } 332 + } 333 + } 334 + 335 + for child := node.FirstChild; child != nil; child = child.NextSibling { 336 + s.scoreTree(child, scores) 337 + } 338 + } 339 + 340 + // calculateConfidence estimates how confident we are in this content selection (between 0 & 1). 
341 + func (s *Scorer) calculateConfidence(score *ContentScore) float64 { 342 + if score == nil { 343 + return 0.0 344 + } 345 + 346 + confidence := 0.0 347 + 348 + if score.Score > s.minScore*2 { 349 + confidence += 0.3 350 + } else if score.Score > s.minScore { 351 + confidence += 0.15 352 + } 353 + 354 + if score.TextLength > s.minContentLength*3 { 355 + confidence += 0.3 356 + } else if score.TextLength > s.minContentLength { 357 + confidence += 0.15 358 + } 359 + 360 + if score.LinkDensity < 0.2 { 361 + confidence += 0.2 362 + } else if score.LinkDensity < 0.4 { 363 + confidence += 0.1 364 + } 365 + 366 + if score.ParagraphCount >= 3 { 367 + confidence += 0.2 368 + } else if score.ParagraphCount >= 1 { 369 + confidence += 0.1 370 + } 371 + 372 + if confidence > 1.0 { 373 + confidence = 1.0 374 + } 375 + 376 + return confidence 377 + } 378 + 379 + // IsProbablyReadable determines if a document is likely to have extractable content. 380 + // This is inspired by Readability.js's isProbablyReaderable function. 381 + func (s *Scorer) IsProbablyReadable(doc *html.Node) bool { 382 + if doc == nil { 383 + return false 384 + } 385 + 386 + paragraphCount := s.countParagraphs(doc) 387 + textLength := s.calculateTextLength(doc) 388 + return paragraphCount >= 3 && textLength >= s.minContentLength 389 + }
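Taken together, the scorer is meant to be driven in three steps: gate the document with `IsProbablyReadable`, rank subtrees with `FindTopCandidates`, then read the winner's `ConfidenceLevel`. A short sketch of that flow, written as if it lived inside the `articles` package (the sample HTML is invented for illustration):

```go
package articles

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func scoreExample() {
	const page = `<html><body>
		<nav class="menu"><a href="#">Home</a></nav>
		<article class="post">
			<p>First paragraph of real content, long enough to influence the text-length bonus.</p>
			<p>Second paragraph with additional substantial text for the scoring pass.</p>
			<p>Third paragraph so the document clears the three-paragraph readability gate.</p>
		</article>
	</body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		panic(err)
	}

	s := NewScorer()
	if !s.IsProbablyReadable(doc) { // needs >= 3 <p> tags and >= 140 chars of text
		fmt.Println("document does not look readable")
		return
	}

	for _, c := range s.FindTopCandidates(doc, 3) {
		fmt.Printf("<%s> score=%.1f paragraphs=%d confidence=%.2f\n",
			c.Node.Data, c.Score, c.ParagraphCount, c.ConfidenceLevel)
	}
}
```

With these weights the `<article class="post">` subtree should outrank its ancestors: it collects the `article` tag bonus (+30), the positive class match (+25), per-paragraph credit, and the logarithmic text-length bonus, while the `<nav class="menu">` is suppressed by the negative class pattern.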
+421
internal/articles/scorer_test.go
··· 1 + package articles 2 + 3 + import ( 4 + "strings" 5 + "testing" 6 + 7 + "golang.org/x/net/html" 8 + ) 9 + 10 + func parseHTML(htmlStr string) *html.Node { 11 + doc, err := html.Parse(strings.NewReader(htmlStr)) 12 + if err != nil { 13 + return nil 14 + } 15 + return doc 16 + } 17 + 18 + func findElement(node *html.Node, tagName string) *html.Node { 19 + if node == nil { 20 + return nil 21 + } 22 + 23 + if node.Type == html.ElementNode && strings.EqualFold(node.Data, tagName) { 24 + return node 25 + } 26 + 27 + for child := node.FirstChild; child != nil; child = child.NextSibling { 28 + if result := findElement(child, tagName); result != nil { 29 + return result 30 + } 31 + } 32 + 33 + return nil 34 + } 35 + 36 + func findElementWithClass(node *html.Node, className string) *html.Node { 37 + if node == nil { 38 + return nil 39 + } 40 + 41 + if node.Type == html.ElementNode { 42 + for _, attr := range node.Attr { 43 + if attr.Key == "class" && strings.Contains(attr.Val, className) { 44 + return node 45 + } 46 + } 47 + } 48 + 49 + for child := node.FirstChild; child != nil; child = child.NextSibling { 50 + if result := findElementWithClass(child, className); result != nil { 51 + return result 52 + } 53 + } 54 + 55 + return nil 56 + } 57 + 58 + func TestScorer(t *testing.T) { 59 + t.Run("NewScorer", func(t *testing.T) { 60 + t.Run("creates scorer with default weights", func(t *testing.T) { 61 + scorer := NewScorer() 62 + 63 + if scorer == nil { 64 + t.Fatal("Expected scorer to be created, got nil") 65 + } 66 + 67 + if scorer.minContentLength != 140 { 68 + t.Errorf("Expected minContentLength 140, got %d", scorer.minContentLength) 69 + } 70 + 71 + if scorer.minScore != 20.0 { 72 + t.Errorf("Expected minScore 20.0, got %f", scorer.minScore) 73 + } 74 + }) 75 + }) 76 + 77 + t.Run("ScoreNode", func(t *testing.T) { 78 + scorer := NewScorer() 79 + 80 + t.Run("scores article tag highly", func(t *testing.T) { 81 + htmlStr := `<html><body><article class="main-content">Article content</article></body></html>` 82 + doc := parseHTML(htmlStr) 83 + article := findElement(doc, "article") 84 + 85 + score := scorer.ScoreNode(article) 86 + 87 + if score == nil { 88 + t.Fatal("Expected score, got nil") 89 + } 90 + 91 + if score.Score <= 0 { 92 + t.Errorf("Expected positive score for article tag, got %f", score.Score) 93 + } 94 + }) 95 + 96 + t.Run("penalizes navigation elements", func(t *testing.T) { 97 + htmlStr := `<html><body><div class="navigation sidebar">Nav</div></body></html>` 98 + doc := parseHTML(htmlStr) 99 + nav := findElementWithClass(doc, "navigation") 100 + 101 + score := scorer.ScoreNode(nav) 102 + 103 + if score == nil { 104 + t.Fatal("Expected score, got nil") 105 + } 106 + 107 + if score.Score >= 0 { 108 + t.Errorf("Expected negative score for navigation, got %f", score.Score) 109 + } 110 + }) 111 + 112 + t.Run("calculates text length", func(t *testing.T) { 113 + htmlStr := `<html><body><div>This is some test content with multiple words</div></body></html>` 114 + doc := parseHTML(htmlStr) 115 + div := findElement(doc, "div") 116 + 117 + score := scorer.ScoreNode(div) 118 + 119 + if score == nil { 120 + t.Fatal("Expected score, got nil") 121 + } 122 + 123 + if score.TextLength == 0 { 124 + t.Error("Expected non-zero text length") 125 + } 126 + }) 127 + 128 + t.Run("returns nil for text nodes", func(t *testing.T) { 129 + textNode := &html.Node{Type: html.TextNode, Data: "text"} 130 + score := scorer.ScoreNode(textNode) 131 + 132 + if score != nil { 133 + t.Error("Expected nil score for text 
node") 134 + } 135 + }) 136 + }) 137 + 138 + t.Run("calculateLinkDensity", func(t *testing.T) { 139 + scorer := NewScorer() 140 + 141 + t.Run("calculates high link density", func(t *testing.T) { 142 + htmlStr := `<html><body><div><a href="#">link1</a> <a href="#">link2</a></div></body></html>` 143 + doc := parseHTML(htmlStr) 144 + div := findElement(doc, "div") 145 + 146 + density := scorer.calculateLinkDensity(div) 147 + 148 + if density < 0.5 { 149 + t.Errorf("Expected high link density (>0.5), got %f", density) 150 + } 151 + }) 152 + 153 + t.Run("calculates low link density", func(t *testing.T) { 154 + htmlStr := `<html><body><div>Lots of regular text content here with just <a href="#">one link</a> in it</div></body></html>` 155 + doc := parseHTML(htmlStr) 156 + div := findElement(doc, "div") 157 + 158 + density := scorer.calculateLinkDensity(div) 159 + 160 + if density > 0.3 { 161 + t.Errorf("Expected low link density (<0.3), got %f", density) 162 + } 163 + }) 164 + 165 + t.Run("returns zero for empty content", func(t *testing.T) { 166 + htmlStr := `<html><body><div></div></body></html>` 167 + doc := parseHTML(htmlStr) 168 + div := findElement(doc, "div") 169 + 170 + density := scorer.calculateLinkDensity(div) 171 + 172 + if density != 0.0 { 173 + t.Errorf("Expected zero density for empty content, got %f", density) 174 + } 175 + }) 176 + }) 177 + 178 + t.Run("getClassIdScore", func(t *testing.T) { 179 + scorer := NewScorer() 180 + 181 + t.Run("positive score for content class", func(t *testing.T) { 182 + node := &html.Node{ 183 + Type: html.ElementNode, 184 + Data: "div", 185 + Attr: []html.Attribute{{Key: "class", Val: "article-content"}}, 186 + } 187 + 188 + score := scorer.getClassIdScore(node) 189 + 190 + if score <= 0 { 191 + t.Errorf("Expected positive score for content class, got %f", score) 192 + } 193 + }) 194 + 195 + t.Run("negative score for sidebar class", func(t *testing.T) { 196 + node := &html.Node{ 197 + Type: html.ElementNode, 198 + Data: "div", 199 + Attr: []html.Attribute{{Key: "class", Val: "sidebar"}}, 200 + } 201 + 202 + score := scorer.getClassIdScore(node) 203 + 204 + if score >= 0 { 205 + t.Errorf("Expected negative score for sidebar class, got %f", score) 206 + } 207 + }) 208 + 209 + t.Run("strong negative score for banner", func(t *testing.T) { 210 + node := &html.Node{ 211 + Type: html.ElementNode, 212 + Data: "div", 213 + Attr: []html.Attribute{{Key: "id", Val: "banner"}}, 214 + } 215 + 216 + score := scorer.getClassIdScore(node) 217 + 218 + if score > -30 { 219 + t.Errorf("Expected strong negative score for banner, got %f", score) 220 + } 221 + }) 222 + }) 223 + 224 + t.Run("countParagraphs", func(t *testing.T) { 225 + scorer := NewScorer() 226 + 227 + t.Run("counts multiple paragraphs", func(t *testing.T) { 228 + htmlStr := `<html><body><div><p>First</p><p>Second</p><p>Third</p></div></body></html>` 229 + doc := parseHTML(htmlStr) 230 + div := findElement(doc, "div") 231 + 232 + count := scorer.countParagraphs(div) 233 + 234 + if count != 3 { 235 + t.Errorf("Expected 3 paragraphs, got %d", count) 236 + } 237 + }) 238 + 239 + t.Run("returns zero for no paragraphs", func(t *testing.T) { 240 + htmlStr := `<html><body><div>Just text</div></body></html>` 241 + doc := parseHTML(htmlStr) 242 + div := findElement(doc, "div") 243 + 244 + count := scorer.countParagraphs(div) 245 + 246 + if count != 0 { 247 + t.Errorf("Expected 0 paragraphs, got %d", count) 248 + } 249 + }) 250 + }) 251 + 252 + t.Run("FindTopCandidates", func(t *testing.T) { 253 + scorer := 
NewScorer() 254 + 255 + t.Run("finds article with substantial content", func(t *testing.T) { 256 + htmlStr := `<html><body> 257 + <article class="main-content"> 258 + <p>This is a long paragraph with substantial content that should score well in the readability algorithm.</p> 259 + <p>This is another paragraph with more content to increase the score.</p> 260 + <p>And a third paragraph to ensure we have enough text and structure.</p> 261 + </article> 262 + <aside class="sidebar"> 263 + <a href="#">Link</a> 264 + </aside> 265 + </body></html>` 266 + doc := parseHTML(htmlStr) 267 + 268 + candidates := scorer.FindTopCandidates(doc, 5) 269 + 270 + if len(candidates) == 0 { 271 + t.Fatal("Expected to find candidates") 272 + } 273 + 274 + topScore := candidates[0] 275 + if topScore.Score <= 0 { 276 + t.Errorf("Expected positive score for top candidate, got %f", topScore.Score) 277 + } 278 + 279 + if topScore.ParagraphCount < 3 { 280 + t.Errorf("Expected top candidate to contain paragraphs, got %d", topScore.ParagraphCount) 281 + } 282 + }) 283 + 284 + t.Run("filters out low-scoring nodes", func(t *testing.T) { 285 + htmlStr := `<html><body> 286 + <div class="ad">Short ad</div> 287 + <nav class="menu"><a href="#">Link</a></nav> 288 + </body></html>` 289 + doc := parseHTML(htmlStr) 290 + 291 + candidates := scorer.FindTopCandidates(doc, 5) 292 + 293 + for _, candidate := range candidates { 294 + if candidate.Score < scorer.minScore { 295 + t.Errorf("Expected all candidates to meet minimum score, got %f", candidate.Score) 296 + } 297 + if candidate.TextLength < scorer.minContentLength { 298 + t.Errorf("Expected all candidates to meet minimum length, got %d", candidate.TextLength) 299 + } 300 + } 301 + }) 302 + 303 + t.Run("returns empty for nil root", func(t *testing.T) { 304 + candidates := scorer.FindTopCandidates(nil, 5) 305 + 306 + if candidates != nil { 307 + t.Error("Expected nil for nil root") 308 + } 309 + }) 310 + }) 311 + 312 + t.Run("calculateConfidence", func(t *testing.T) { 313 + scorer := NewScorer() 314 + 315 + t.Run("high confidence for good content", func(t *testing.T) { 316 + score := &ContentScore{ 317 + Score: 60.0, 318 + TextLength: 500, 319 + LinkDensity: 0.1, 320 + ParagraphCount: 5, 321 + } 322 + 323 + confidence := scorer.calculateConfidence(score) 324 + 325 + if confidence < 0.5 { 326 + t.Errorf("Expected high confidence (>0.5) for good content, got %f", confidence) 327 + } 328 + 329 + if confidence > 1.0 { 330 + t.Errorf("Expected confidence <= 1.0, got %f", confidence) 331 + } 332 + }) 333 + 334 + t.Run("low confidence for poor content", func(t *testing.T) { 335 + score := &ContentScore{ 336 + Score: 10.0, 337 + TextLength: 50, 338 + LinkDensity: 0.8, 339 + ParagraphCount: 0, 340 + } 341 + 342 + confidence := scorer.calculateConfidence(score) 343 + 344 + if confidence > 0.3 { 345 + t.Errorf("Expected low confidence (<0.3) for poor content, got %f", confidence) 346 + } 347 + }) 348 + 349 + t.Run("returns zero for nil score", func(t *testing.T) { 350 + confidence := scorer.calculateConfidence(nil) 351 + 352 + if confidence != 0.0 { 353 + t.Errorf("Expected 0.0 for nil score, got %f", confidence) 354 + } 355 + }) 356 + }) 357 + 358 + t.Run("IsProbablyReadable", func(t *testing.T) { 359 + scorer := NewScorer() 360 + 361 + t.Run("returns true for readable document", func(t *testing.T) { 362 + htmlStr := `<html><body> 363 + <article> 364 + <p>First paragraph with sufficient text content to be considered readable.</p> 365 + <p>Second paragraph with more text.</p> 366 + <p>Third 
paragraph with additional content.</p> 367 + </article> 368 + </body></html>` 369 + doc := parseHTML(htmlStr) 370 + 371 + readable := scorer.IsProbablyReadable(doc) 372 + 373 + if !readable { 374 + t.Error("Expected document to be readable") 375 + } 376 + }) 377 + 378 + t.Run("returns false for short document", func(t *testing.T) { 379 + htmlStr := `<html><body><div>Short</div></body></html>` 380 + doc := parseHTML(htmlStr) 381 + 382 + readable := scorer.IsProbablyReadable(doc) 383 + 384 + if readable { 385 + t.Error("Expected document to not be readable") 386 + } 387 + }) 388 + 389 + t.Run("returns false for nil document", func(t *testing.T) { 390 + readable := scorer.IsProbablyReadable(nil) 391 + 392 + if readable { 393 + t.Error("Expected nil document to not be readable") 394 + } 395 + }) 396 + }) 397 + 398 + t.Run("ScoreAncestors", func(t *testing.T) { 399 + scorer := NewScorer() 400 + 401 + t.Run("propagates score to parent nodes", func(t *testing.T) { 402 + htmlStr := `<html><body><div><article><p>Content</p></article></div></body></html>` 403 + doc := parseHTML(htmlStr) 404 + p := findElement(doc, "p") 405 + 406 + scores := make(map[*html.Node]*ContentScore) 407 + scores[p] = &ContentScore{Node: p, Score: 10.0} 408 + 409 + scorer.ScoreAncestors(scores, p, 100.0) 410 + 411 + article := findElement(doc, "article") 412 + if scores[article] == nil { 413 + t.Error("Expected article to receive propagated score") 414 + } 415 + 416 + if scores[article].Score <= 0 { 417 + t.Errorf("Expected positive propagated score, got %f", scores[article].Score) 418 + } 419 + }) 420 + }) 421 + }
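The confidence rubric in `calculateConfidence` is additive: up to 0.3 for raw score, 0.3 for text length, 0.2 for low link density, and 0.2 for paragraph count, capped at 1.0. A worked check against the "high confidence" fixture above (a sketch, not part of the diff; the comparison uses a tolerance since the contributions are floating-point):

```go
package articles

import (
	"math"
	"testing"
)

// The fixture from the test above lands in every top bracket:
//   Score 60        > 2*minScore (40)           -> +0.30
//   TextLength 500  > 3*minContentLength (420)  -> +0.30
//   LinkDensity 0.1 < 0.2                       -> +0.20
//   Paragraphs 5    >= 3                        -> +0.20
//                                          total = 1.00
func TestConfidenceWorkedExample(t *testing.T) {
	s := NewScorer()
	got := s.calculateConfidence(&ContentScore{
		Score:          60.0,
		TextLength:     500,
		LinkDensity:    0.1,
		ParagraphCount: 5,
	})
	if math.Abs(got-1.0) > 1e-9 {
		t.Fatalf("expected confidence ~1.0, got %f", got)
	}
}
```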
+9 -9
internal/services/services_test.go
··· 4 4 "context" 5 5 "encoding/json" 6 6 "net/http" 7 - "net/http/httptest" 8 7 "strings" 9 8 "testing" 10 9 "time" 11 10 11 + "github.com/stormlightlabs/noteleaf/internal/shared" 12 12 "golang.org/x/time/rate" 13 13 ) 14 14 ··· 61 61 }, 62 62 } 63 63 64 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 64 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 65 65 if r.URL.Path != "/search.json" { 66 66 t.Errorf("Expected path /search.json, got %s", r.URL.Path) 67 67 } ··· 104 104 }) 105 105 106 106 t.Run("handles API error", func(t *testing.T) { 107 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 107 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 108 108 w.WriteHeader(http.StatusInternalServerError) 109 109 })) 110 110 defer server.Close() ··· 121 121 }) 122 122 123 123 t.Run("handles malformed JSON", func(t *testing.T) { 124 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 124 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 125 125 w.Header().Set("Content-Type", "application/json") 126 126 w.Write([]byte("invalid json")) 127 127 })) ··· 175 175 Covers: []int{8739161, 8739162}, 176 176 } 177 177 178 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 178 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 179 179 if !strings.HasPrefix(r.URL.Path, "/works/") { 180 180 t.Errorf("Expected path to start with /works/, got %s", r.URL.Path) 181 181 } ··· 218 218 }) 219 219 220 220 t.Run("handles not found", func(t *testing.T) { 221 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 221 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 222 222 w.WriteHeader(http.StatusNotFound) 223 223 })) 224 224 defer server.Close() ··· 235 235 }) 236 236 237 237 t.Run("handles API error", func(t *testing.T) { 238 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 238 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 239 239 w.WriteHeader(http.StatusInternalServerError) 240 240 })) 241 241 defer server.Close() ··· 254 254 255 255 t.Run("Check", func(t *testing.T) { 256 256 t.Run("successful check", func(t *testing.T) { 257 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 257 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 258 258 if r.URL.Path != "/search.json" { 259 259 t.Errorf("Expected path /search.json, got %s", r.URL.Path) 260 260 } ··· 286 286 }) 287 287 288 288 t.Run("handles API failure", func(t *testing.T) { 289 - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 289 + server := shared.NewHTTPTestServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 290 290 w.WriteHeader(http.StatusServiceUnavailable) 291 291 })) 292 292 defer server.Close()
+29 -1
internal/shared/test_utilities.go
··· 3 3 4 4 import ( 5 5 "encoding/json" 6 + "net" 6 7 "net/http" 7 8 "net/http/httptest" 8 9 "os" ··· 10 11 "testing" 11 12 "time" 12 13 ) 14 + 15 + func NewHTTPTestServer(t testing.TB, handler http.Handler) *httptest.Server { 16 + t.Helper() 17 + server, err := startHTTPTestServer(handler) 18 + if err != nil { 19 + t.Fatalf("failed to start HTTP test server: %v", err) 20 + } 21 + return server 22 + } 23 + 24 + func startHTTPTestServer(handler http.Handler) (*httptest.Server, error) { 25 + ln, err := net.Listen("tcp4", "127.0.0.1:0") 26 + if err != nil { 27 + return nil, err 28 + } 29 + server := &httptest.Server{ 30 + Listener: ln, 31 + Config: &http.Server{Handler: handler}, 32 + } 33 + server.Start() 34 + return server, nil 35 + } 13 36 14 37 func CreateTempDir(p string, t *testing.T) (string, func()) { 15 38 t.Helper() ··· 144 167 145 168 // WithHandler sets up the mock server with a custom handler 146 169 func (m *HTTPMockServer) WithHandler(handler http.HandlerFunc) *HTTPMockServer { 147 - m.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 170 + server, err := startHTTPTestServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 148 171 m.requests = append(m.requests, r) 149 172 handler(w, r) 150 173 })) 174 + if err != nil { 175 + panic(err) 176 + } 177 + 178 + m.server = server 151 179 return m 152 180 } 153 181
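`startHTTPTestServer` assembles the `httptest.Server` by hand instead of calling `httptest.NewServer`, for two reasons suggested by the code: the listener is pinned to `tcp4` on `127.0.0.1` (sidestepping machines where `localhost` resolves to IPv6 first), and a failed `net.Listen` becomes an ordinary error for `NewHTTPTestServer` to report via `t.Fatalf` rather than a panic. A standalone sketch of the same construction, using only the standard library:

```go
package main

import (
	"fmt"
	"io"
	"net"
	"net/http"
	"net/http/httptest"
)

func main() {
	// Bind explicitly to IPv4 loopback on an ephemeral port.
	ln, err := net.Listen("tcp4", "127.0.0.1:0")
	if err != nil {
		fmt.Println("listen failed:", err) // caller decides; no panic
		return
	}

	// Manual construction mirrors startHTTPTestServer: supply the
	// listener and handler, then start the server on them.
	server := &httptest.Server{
		Listener: ln,
		Config: &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			io.WriteString(w, "ok")
		})},
	}
	server.Start()
	defer server.Close()

	resp, err := http.Get(server.URL)
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body))
}
```

One caveat worth noting from the diff itself: `WithHandler` still panics when the listener cannot be opened, since `HTTPMockServer` holds no `testing.TB` to fail against; tests that need graceful failure should prefer `NewHTTPTestServer`.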