Regular paragraph with actual content that should remain.
`
doc := parseHTML(htmlStr)
cleaned := extractor.cleanDocument(doc)
p := findElement(cleaned, "p")
if p == nil {
t.Error("Expected paragraph to remain")
}
})
})
t.Run("extractTextContent", func(t *testing.T) {
extractor := NewHeuristicExtractor()
t.Run("extracts text with basic formatting", func(t *testing.T) {
htmlStr := `
First paragraph
Second paragraph
`
doc := parseHTML(htmlStr)
div := findElement(doc, "div")
text := extractor.extractTextContent(div)
if !strings.Contains(text, "First paragraph") {
t.Error("Expected text to contain first paragraph")
}
if !strings.Contains(text, "Second paragraph") {
t.Error("Expected text to contain second paragraph")
}
})
t.Run("formats list items with bullets", func(t *testing.T) {
htmlStr := `
Item 1
Item 2
`
doc := parseHTML(htmlStr)
ul := findElement(doc, "ul")
text := extractor.extractTextContent(ul)
if !strings.Contains(text, "•") {
t.Error("Expected text to contain bullet points")
}
})
t.Run("returns empty string for nil node", func(t *testing.T) {
text := extractor.extractTextContent(nil)
if text != "" {
t.Error("Expected empty string for nil node")
}
})
})
t.Run("CompareWithXPath", func(t *testing.T) {
extractor := NewHeuristicExtractor()
t.Run("high confidence when XPath and heuristics agree", func(t *testing.T) {
htmlStr := `
This is substantial content that both methods should find.
Another paragraph with more details and information.
And a third paragraph for good measure and completeness.
`
doc := parseHTML(htmlStr)
article := findElement(doc, "article")
result := extractor.CompareWithXPath(doc, article)
if result == nil {
t.Fatal("Expected result, got nil")
}
if result.Confidence < 0.8 {
t.Errorf("Expected high confidence when methods agree, got %f", result.Confidence)
}
if !strings.Contains(result.ExtractionMethod, "dual") && !strings.Contains(result.ExtractionMethod, "validated") {
t.Errorf("Expected dual validation method, got %s", result.ExtractionMethod)
}
})
t.Run("prefers XPath when it extracts more content", func(t *testing.T) {
htmlStr := `
Short content
This is additional content that XPath found but heuristics might miss.
Even more content here to make a significant difference in length.
And yet another paragraph to ensure XPath extraction is substantially longer.
`
doc := parseHTML(htmlStr)
// XPath would get more content
body := findElement(doc, "body")
result := extractor.CompareWithXPath(doc, body)
if result == nil {
t.Fatal("Expected result, got nil")
}
// Should prefer one method over the other
if result.ExtractionMethod == "heuristic" {
t.Errorf("Expected method preference, got %s", result.ExtractionMethod)
}
})
t.Run("uses heuristics when XPath node is nil", func(t *testing.T) {
htmlStr := `
Content that heuristics should find on its own.
Additional paragraph for sufficient content length.
Third paragraph to meet minimum requirements.
`
doc := parseHTML(htmlStr)
result := extractor.CompareWithXPath(doc, nil)
if result == nil {
t.Fatal("Expected result, got nil")
}
if result.ExtractionMethod != "heuristic" {
t.Errorf("Expected heuristic method when XPath is nil, got %s", result.ExtractionMethod)
}
})
t.Run("returns nil for nil document", func(t *testing.T) {
result := extractor.CompareWithXPath(nil, nil)
if result != nil {
t.Error("Expected nil for nil document")
}
})
})
t.Run("calculateSimilarity", func(t *testing.T) {
extractor := NewHeuristicExtractor()
t.Run("returns high similarity for identical text", func(t *testing.T) {
text := "This is some test content"
similarity := extractor.calculateSimilarity(text, text)
if similarity < 0.9 {
t.Errorf("Expected high similarity for identical text, got %f", similarity)
}
})
t.Run("returns low similarity for different text", func(t *testing.T) {
text1 := "This is the first piece of content"
text2 := "Completely different words and phrases"
similarity := extractor.calculateSimilarity(text1, text2)
if similarity > 0.3 {
t.Errorf("Expected low similarity for different text, got %f", similarity)
}
})
t.Run("returns zero for empty strings", func(t *testing.T) {
similarity := extractor.calculateSimilarity("text", "")
if similarity != 0.0 {
t.Errorf("Expected zero similarity for empty string, got %f", similarity)
}
})
t.Run("returns one for both empty", func(t *testing.T) {
similarity := extractor.calculateSimilarity("", "")
if similarity != 1.0 {
t.Errorf("Expected 1.0 similarity for both empty, got %f", similarity)
}
})
})
t.Run("ExtractWithSemanticHTML", func(t *testing.T) {
extractor := NewHeuristicExtractor()
t.Run("extracts from article tag", func(t *testing.T) {
htmlStr := `
This is the main article content that should be extracted.
Second paragraph of the article with more information.
Third paragraph to provide sufficient content length.
`
doc := parseHTML(htmlStr)
result := extractor.ExtractWithSemanticHTML(doc)
if result == nil {
t.Fatal("Expected result, got nil")
}
if result.ExtractionMethod != "semantic-html" {
t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
}
if !strings.Contains(result.Content, "main article content") {
t.Error("Expected content from article tag")
}
if result.Confidence < 0.85 {
t.Errorf("Expected high confidence for semantic HTML, got %f", result.Confidence)
}
})
t.Run("extracts from main tag", func(t *testing.T) {
htmlStr := `
Header
This is the main content area with sufficient text.
Additional content paragraph with more details.
Third paragraph for completeness and length.
`
doc := parseHTML(htmlStr)
result := extractor.ExtractWithSemanticHTML(doc)
if result == nil {
t.Fatal("Expected result, got nil")
}
if result.ExtractionMethod != "semantic-html" {
t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod)
}
if !strings.Contains(result.Content, "main content area") {
t.Error("Expected content from main tag")
}
})
t.Run("falls back to heuristics without semantic tags", func(t *testing.T) {
htmlStr := `
Content in a regular div without semantic HTML tags.
Second paragraph with additional information.
Third paragraph for sufficient content.
`
doc := parseHTML(htmlStr)
result := extractor.ExtractWithSemanticHTML(doc)
if result == nil {
t.Fatal("Expected result, got nil")
}
if result.ExtractionMethod == "semantic-html" {
t.Error("Should not use semantic-html method without semantic tags")
}
})
t.Run("returns nil for nil document", func(t *testing.T) {
result := extractor.ExtractWithSemanticHTML(nil)
if result != nil {
t.Error("Expected nil for nil document")
}
})
})
t.Run("isBlockElement", func(t *testing.T) {
extractor := NewHeuristicExtractor()
t.Run("identifies block elements", func(t *testing.T) {
blockTags := []string{"p", "div", "article", "h1", "section"}
for _, tag := range blockTags {
if !extractor.isBlockElement(tag) {
t.Errorf("Expected %s to be a block element", tag)
}
}
})
t.Run("identifies non-block elements", func(t *testing.T) {
inlineTags := []string{"span", "a", "em", "strong", "code"}
for _, tag := range inlineTags {
if extractor.isBlockElement(tag) {
t.Errorf("Expected %s to not be a block element", tag)
}
}
})
})
}