`
doc := parseHTML(htmlStr)
title := extractor.ExtractTitle(doc)
if title != "Article Title from OpenGraph" {
t.Errorf("Expected OpenGraph title, got %q", title)
}
})
t.Run("extracts from title tag", func(t *testing.T) {
htmlStr := `
Page Title from Title Tag
`
doc := parseHTML(htmlStr)
title := extractor.ExtractTitle(doc)
if title != "Page Title from Title Tag" {
t.Errorf("Expected title tag content, got %q", title)
}
})
t.Run("extracts from h1", func(t *testing.T) {
htmlStr := `
Heading Title
`
doc := parseHTML(htmlStr)
title := extractor.ExtractTitle(doc)
if title != "Heading Title" {
t.Errorf("Expected h1 content, got %q", title)
}
})
t.Run("returns empty for nil document", func(t *testing.T) {
	// Passing nil instead of a parsed document must produce an empty title.
	if got := extractor.ExtractTitle(nil); got != "" {
		t.Errorf("Expected empty string for nil document, got %q", got)
	}
})
t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) {
htmlStr := `
HTML Title
`
doc := parseHTML(htmlStr)
title := extractor.ExtractTitle(doc)
if title != "OpenGraph Title" {
t.Errorf("Expected OpenGraph title to have priority, got %q", title)
}
})
})
t.Run("ExtractAuthor", func(t *testing.T) {
extractor := NewMetadataExtractor()
t.Run("extracts from OpenGraph", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
author := extractor.ExtractAuthor(doc)
if author != "John Doe" {
t.Errorf("Expected OpenGraph author, got %q", author)
}
})
t.Run("extracts from meta tag", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
author := extractor.ExtractAuthor(doc)
if author != "Jane Smith" {
t.Errorf("Expected meta tag author, got %q", author)
}
})
t.Run("extracts from rel=author link", func(t *testing.T) {
htmlStr := `
Bob Johnson
`
doc := parseHTML(htmlStr)
author := extractor.ExtractAuthor(doc)
if author != "Bob Johnson" {
t.Errorf("Expected rel=author link text, got %q", author)
}
})
t.Run("extracts from byline class", func(t *testing.T) {
htmlStr := `
Alice Brown
`
doc := parseHTML(htmlStr)
author := extractor.ExtractAuthor(doc)
if author != "Alice Brown" {
t.Errorf("Expected byline class text, got %q", author)
}
})
t.Run("returns empty for nil document", func(t *testing.T) {
author := extractor.ExtractAuthor(nil)
if author != "" {
t.Errorf("Expected empty string for nil document, got %q", author)
}
})
})
t.Run("ExtractPublishedDate", func(t *testing.T) {
extractor := NewMetadataExtractor()
t.Run("extracts from OpenGraph", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
date := extractor.ExtractPublishedDate(doc)
if date != "2025-01-15T10:00:00Z" {
t.Errorf("Expected OpenGraph date, got %q", date)
}
})
t.Run("extracts from article:published_time", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
date := extractor.ExtractPublishedDate(doc)
if date != "2025-02-20" {
t.Errorf("Expected article:published_time, got %q", date)
}
})
t.Run("extracts from time element", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
date := extractor.ExtractPublishedDate(doc)
if date != "2025-03-25T14:30:00" {
t.Errorf("Expected time element datetime, got %q", date)
}
})
t.Run("returns empty for nil document", func(t *testing.T) {
date := extractor.ExtractPublishedDate(nil)
if date != "" {
t.Errorf("Expected empty string for nil document, got %q", date)
}
})
})
t.Run("ExtractSiteName", func(t *testing.T) {
extractor := NewMetadataExtractor()
t.Run("extracts from OpenGraph", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
siteName := extractor.ExtractSiteName(doc)
if siteName != "Example News" {
t.Errorf("Expected OpenGraph site_name, got %q", siteName)
}
})
t.Run("extracts from application-name", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
siteName := extractor.ExtractSiteName(doc)
if siteName != "Tech Blog" {
t.Errorf("Expected application-name, got %q", siteName)
}
})
t.Run("returns empty for nil document", func(t *testing.T) {
siteName := extractor.ExtractSiteName(nil)
if siteName != "" {
t.Errorf("Expected empty string for nil document, got %q", siteName)
}
})
})
t.Run("ExtractLanguage", func(t *testing.T) {
extractor := NewMetadataExtractor()
t.Run("extracts from html lang attribute", func(t *testing.T) {
htmlStr := ``
doc := parseHTML(htmlStr)
lang := extractor.ExtractLanguage(doc)
if lang != "en-US" {
t.Errorf("Expected html lang attribute, got %q", lang)
}
})
t.Run("extracts from OpenGraph locale", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
lang := extractor.ExtractLanguage(doc)
if lang != "fr-FR" {
t.Errorf("Expected OpenGraph locale, got %q", lang)
}
})
t.Run("returns empty for nil document", func(t *testing.T) {
lang := extractor.ExtractLanguage(nil)
if lang != "" {
t.Errorf("Expected empty string for nil document, got %q", lang)
}
})
})
t.Run("getSchemaOrgField", func(t *testing.T) {
extractor := NewMetadataExtractor()
t.Run("extracts from JSON-LD Article", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
headline := extractor.getSchemaOrgField(doc, "headline")
author := extractor.getSchemaOrgField(doc, "author")
date := extractor.getSchemaOrgField(doc, "datePublished")
if headline != "Test Article" {
t.Errorf("Expected headline from JSON-LD, got %q", headline)
}
if author != "Test Author" {
t.Errorf("Expected author from JSON-LD, got %q", author)
}
if date != "2025-01-15" {
t.Errorf("Expected datePublished from JSON-LD, got %q", date)
}
})
t.Run("extracts from NewsArticle type", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
headline := extractor.getSchemaOrgField(doc, "headline")
if headline != "Breaking News" {
t.Errorf("Expected headline from NewsArticle, got %q", headline)
}
})
t.Run("handles nested author object", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
author := extractor.getSchemaOrgField(doc, "author")
if author != "Nested Author" {
t.Errorf("Expected nested author name, got %q", author)
}
})
t.Run("returns empty for invalid JSON", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
result := extractor.getSchemaOrgField(doc, "headline")
if result != "" {
t.Errorf("Expected empty for invalid JSON, got %q", result)
}
})
t.Run("returns empty for non-Article types", func(t *testing.T) {
htmlStr := `
`
doc := parseHTML(htmlStr)
result := extractor.getSchemaOrgField(doc, "headline")
if result != "" {
t.Errorf("Expected empty for WebPage type, got %q", result)
}
})
})
t.Run("ExtractMetadata", func(t *testing.T) {
	// Sub-tests for ExtractMetadata, which returns one result value exposing
	// Title, Author, PublishedDate, SiteName and Language.
	ex := NewMetadataExtractor()

	t.Run("extracts all metadata fields", func(t *testing.T) {
		// NOTE(review): only the title text is visible in this literal; the
		// tags that supply author, date, site name and language are not —
		// confirm the fixture contains them.
		fixture := `
Full Article Title
`
		meta := ex.ExtractMetadata(parseHTML(fixture))
		if meta == nil {
			// Nothing else can be checked without a result.
			t.Fatal("Expected result, got nil")
		}

		// The title is matched by substring; every other field must match
		// exactly.
		if got := meta.Title; !strings.Contains(got, "Full Article Title") {
			t.Errorf("Expected title to be extracted, got %q", got)
		}
		if got := meta.Author; got != "Full Name" {
			t.Errorf("Expected author to be extracted, got %q", got)
		}
		if got := meta.PublishedDate; got != "2025-01-20" {
			t.Errorf("Expected date to be extracted, got %q", got)
		}
		if got := meta.SiteName; got != "News Site" {
			t.Errorf("Expected site name to be extracted, got %q", got)
		}
		if got := meta.Language; got != "en" {
			t.Errorf("Expected language to be extracted, got %q", got)
		}
	})

	t.Run("returns empty result for nil document", func(t *testing.T) {
		// A nil document must still yield a non-nil result value.
		if meta := ex.ExtractMetadata(nil); meta == nil {
			t.Error("Expected empty result, got nil")
		}
	})
})
}