package articles import ( "strings" "testing" "golang.org/x/net/html" ) func parseHTML(htmlStr string) *html.Node { doc, err := html.Parse(strings.NewReader(htmlStr)) if err != nil { return nil } return doc } func findElement(node *html.Node, tagName string) *html.Node { if node == nil { return nil } if node.Type == html.ElementNode && strings.EqualFold(node.Data, tagName) { return node } for child := node.FirstChild; child != nil; child = child.NextSibling { if result := findElement(child, tagName); result != nil { return result } } return nil } func findElementWithClass(node *html.Node, className string) *html.Node { if node == nil { return nil } if node.Type == html.ElementNode { for _, attr := range node.Attr { if attr.Key == "class" && strings.Contains(attr.Val, className) { return node } } } for child := node.FirstChild; child != nil; child = child.NextSibling { if result := findElementWithClass(child, className); result != nil { return result } } return nil } func TestScorer(t *testing.T) { t.Run("NewScorer", func(t *testing.T) { t.Run("creates scorer with default weights", func(t *testing.T) { scorer := NewScorer() if scorer == nil { t.Fatal("Expected scorer to be created, got nil") } if scorer.minContentLength != 140 { t.Errorf("Expected minContentLength 140, got %d", scorer.minContentLength) } if scorer.minScore != 20.0 { t.Errorf("Expected minScore 20.0, got %f", scorer.minScore) } }) }) t.Run("ScoreNode", func(t *testing.T) { scorer := NewScorer() t.Run("scores article tag highly", func(t *testing.T) { htmlStr := `
Article content
` doc := parseHTML(htmlStr) article := findElement(doc, "article") score := scorer.ScoreNode(article) if score == nil { t.Fatal("Expected score, got nil") } if score.Score <= 0 { t.Errorf("Expected positive score for article tag, got %f", score.Score) } }) t.Run("penalizes navigation elements", func(t *testing.T) { htmlStr := `` doc := parseHTML(htmlStr) nav := findElementWithClass(doc, "navigation") score := scorer.ScoreNode(nav) if score == nil { t.Fatal("Expected score, got nil") } if score.Score >= 0 { t.Errorf("Expected negative score for navigation, got %f", score.Score) } }) t.Run("calculates text length", func(t *testing.T) { htmlStr := `
This is some test content with multiple words
` doc := parseHTML(htmlStr) div := findElement(doc, "div") score := scorer.ScoreNode(div) if score == nil { t.Fatal("Expected score, got nil") } if score.TextLength == 0 { t.Error("Expected non-zero text length") } }) t.Run("returns nil for text nodes", func(t *testing.T) { textNode := &html.Node{Type: html.TextNode, Data: "text"} score := scorer.ScoreNode(textNode) if score != nil { t.Error("Expected nil score for text node") } }) }) t.Run("calculateLinkDensity", func(t *testing.T) { scorer := NewScorer() t.Run("calculates high link density", func(t *testing.T) { htmlStr := `
link1 link2
` doc := parseHTML(htmlStr) div := findElement(doc, "div") density := scorer.calculateLinkDensity(div) if density < 0.5 { t.Errorf("Expected high link density (>0.5), got %f", density) } }) t.Run("calculates low link density", func(t *testing.T) { htmlStr := `
Lots of regular text content here with just one link in it
` doc := parseHTML(htmlStr) div := findElement(doc, "div") density := scorer.calculateLinkDensity(div) if density > 0.3 { t.Errorf("Expected low link density (<0.3), got %f", density) } }) t.Run("returns zero for empty content", func(t *testing.T) { htmlStr := `
` doc := parseHTML(htmlStr) div := findElement(doc, "div") density := scorer.calculateLinkDensity(div) if density != 0.0 { t.Errorf("Expected zero density for empty content, got %f", density) } }) }) t.Run("getClassIdScore", func(t *testing.T) { scorer := NewScorer() t.Run("positive score for content class", func(t *testing.T) { node := &html.Node{ Type: html.ElementNode, Data: "div", Attr: []html.Attribute{{Key: "class", Val: "article-content"}}, } score := scorer.getClassIdScore(node) if score <= 0 { t.Errorf("Expected positive score for content class, got %f", score) } }) t.Run("negative score for sidebar class", func(t *testing.T) { node := &html.Node{ Type: html.ElementNode, Data: "div", Attr: []html.Attribute{{Key: "class", Val: "sidebar"}}, } score := scorer.getClassIdScore(node) if score >= 0 { t.Errorf("Expected negative score for sidebar class, got %f", score) } }) t.Run("strong negative score for banner", func(t *testing.T) { node := &html.Node{ Type: html.ElementNode, Data: "div", Attr: []html.Attribute{{Key: "id", Val: "banner"}}, } score := scorer.getClassIdScore(node) if score > -30 { t.Errorf("Expected strong negative score for banner, got %f", score) } }) }) t.Run("countParagraphs", func(t *testing.T) { scorer := NewScorer() t.Run("counts multiple paragraphs", func(t *testing.T) { htmlStr := `

First

Second

Third

` doc := parseHTML(htmlStr) div := findElement(doc, "div") count := scorer.countParagraphs(div) if count != 3 { t.Errorf("Expected 3 paragraphs, got %d", count) } }) t.Run("returns zero for no paragraphs", func(t *testing.T) { htmlStr := `
Just text
` doc := parseHTML(htmlStr) div := findElement(doc, "div") count := scorer.countParagraphs(div) if count != 0 { t.Errorf("Expected 0 paragraphs, got %d", count) } }) }) t.Run("FindTopCandidates", func(t *testing.T) { scorer := NewScorer() t.Run("finds article with substantial content", func(t *testing.T) { htmlStr := `

This is a long paragraph with substantial content that should score well in the readability algorithm.

This is another paragraph with more content to increase the score.

And a third paragraph to ensure we have enough text and structure.

` doc := parseHTML(htmlStr) candidates := scorer.FindTopCandidates(doc, 5) if len(candidates) == 0 { t.Fatal("Expected to find candidates") } topScore := candidates[0] if topScore.Score <= 0 { t.Errorf("Expected positive score for top candidate, got %f", topScore.Score) } if topScore.ParagraphCount < 3 { t.Errorf("Expected top candidate to contain paragraphs, got %d", topScore.ParagraphCount) } }) t.Run("filters out low-scoring nodes", func(t *testing.T) { htmlStr := `
Short ad
` doc := parseHTML(htmlStr) candidates := scorer.FindTopCandidates(doc, 5) for _, candidate := range candidates { if candidate.Score < scorer.minScore { t.Errorf("Expected all candidates to meet minimum score, got %f", candidate.Score) } if candidate.TextLength < scorer.minContentLength { t.Errorf("Expected all candidates to meet minimum length, got %d", candidate.TextLength) } } }) t.Run("returns empty for nil root", func(t *testing.T) { candidates := scorer.FindTopCandidates(nil, 5) if candidates != nil { t.Error("Expected nil for nil root") } }) }) t.Run("calculateConfidence", func(t *testing.T) { scorer := NewScorer() t.Run("high confidence for good content", func(t *testing.T) { score := &ContentScore{ Score: 60.0, TextLength: 500, LinkDensity: 0.1, ParagraphCount: 5, } confidence := scorer.calculateConfidence(score) if confidence < 0.5 { t.Errorf("Expected high confidence (>0.5) for good content, got %f", confidence) } if confidence > 1.0 { t.Errorf("Expected confidence <= 1.0, got %f", confidence) } }) t.Run("low confidence for poor content", func(t *testing.T) { score := &ContentScore{ Score: 10.0, TextLength: 50, LinkDensity: 0.8, ParagraphCount: 0, } confidence := scorer.calculateConfidence(score) if confidence > 0.3 { t.Errorf("Expected low confidence (<0.3) for poor content, got %f", confidence) } }) t.Run("returns zero for nil score", func(t *testing.T) { confidence := scorer.calculateConfidence(nil) if confidence != 0.0 { t.Errorf("Expected 0.0 for nil score, got %f", confidence) } }) }) t.Run("IsProbablyReadable", func(t *testing.T) { scorer := NewScorer() t.Run("returns true for readable document", func(t *testing.T) { htmlStr := `

First paragraph with sufficient text content to be considered readable.

Second paragraph with more text.

Third paragraph with additional content.

` doc := parseHTML(htmlStr) readable := scorer.IsProbablyReadable(doc) if !readable { t.Error("Expected document to be readable") } }) t.Run("returns false for short document", func(t *testing.T) { htmlStr := `
Short
` doc := parseHTML(htmlStr) readable := scorer.IsProbablyReadable(doc) if readable { t.Error("Expected document to not be readable") } }) t.Run("returns false for nil document", func(t *testing.T) { readable := scorer.IsProbablyReadable(nil) if readable { t.Error("Expected nil document to not be readable") } }) }) t.Run("ScoreAncestors", func(t *testing.T) { scorer := NewScorer() t.Run("propagates score to parent nodes", func(t *testing.T) { htmlStr := `

Content

` doc := parseHTML(htmlStr) p := findElement(doc, "p") scores := make(map[*html.Node]*ContentScore) scores[p] = &ContentScore{Node: p, Score: 10.0} scorer.ScoreAncestors(scores, p, 100.0) article := findElement(doc, "article") if scores[article] == nil { t.Error("Expected article to receive propagated score") } if scores[article].Score <= 0 { t.Errorf("Expected positive propagated score, got %f", scores[article].Score) } }) }) }