cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 430 lines 11 kB view raw
1package articles 2 3import ( 4 "strings" 5 "testing" 6) 7 8func TestMetadataExtractor(t *testing.T) { 9 t.Run("NewMetadataExtractor", func(t *testing.T) { 10 t.Run("creates extractor", func(t *testing.T) { 11 extractor := NewMetadataExtractor() 12 13 if extractor == nil { 14 t.Fatal("Expected extractor to be created, got nil") 15 } 16 }) 17 }) 18 19 t.Run("ExtractTitle", func(t *testing.T) { 20 extractor := NewMetadataExtractor() 21 22 t.Run("extracts from OpenGraph", func(t *testing.T) { 23 htmlStr := `<html><head> 24 <meta property="og:title" content="Article Title from OpenGraph"> 25 </head><body></body></html>` 26 doc := parseHTML(htmlStr) 27 28 title := extractor.ExtractTitle(doc) 29 30 if title != "Article Title from OpenGraph" { 31 t.Errorf("Expected OpenGraph title, got %q", title) 32 } 33 }) 34 35 t.Run("extracts from title tag", func(t *testing.T) { 36 htmlStr := `<html><head> 37 <title>Page Title from Title Tag</title> 38 </head><body></body></html>` 39 doc := parseHTML(htmlStr) 40 41 title := extractor.ExtractTitle(doc) 42 43 if title != "Page Title from Title Tag" { 44 t.Errorf("Expected title tag content, got %q", title) 45 } 46 }) 47 48 t.Run("extracts from h1", func(t *testing.T) { 49 htmlStr := `<html><body> 50 <h1>Heading Title</h1> 51 </body></html>` 52 doc := parseHTML(htmlStr) 53 54 title := extractor.ExtractTitle(doc) 55 56 if title != "Heading Title" { 57 t.Errorf("Expected h1 content, got %q", title) 58 } 59 }) 60 61 t.Run("returns empty for nil document", func(t *testing.T) { 62 title := extractor.ExtractTitle(nil) 63 64 if title != "" { 65 t.Errorf("Expected empty string for nil document, got %q", title) 66 } 67 }) 68 69 t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) { 70 htmlStr := `<html><head> 71 <meta property="og:title" content="OpenGraph Title"> 72 <title>HTML Title</title> 73 </head><body></body></html>` 74 doc := parseHTML(htmlStr) 75 76 title := extractor.ExtractTitle(doc) 77 78 if title != "OpenGraph Title" { 
79 t.Errorf("Expected OpenGraph title to have priority, got %q", title) 80 } 81 }) 82 }) 83 84 t.Run("ExtractAuthor", func(t *testing.T) { 85 extractor := NewMetadataExtractor() 86 87 t.Run("extracts from OpenGraph", func(t *testing.T) { 88 htmlStr := `<html><head> 89 <meta property="og:author" content="John Doe"> 90 </head><body></body></html>` 91 doc := parseHTML(htmlStr) 92 93 author := extractor.ExtractAuthor(doc) 94 95 if author != "John Doe" { 96 t.Errorf("Expected OpenGraph author, got %q", author) 97 } 98 }) 99 100 t.Run("extracts from meta tag", func(t *testing.T) { 101 htmlStr := `<html><head> 102 <meta name="author" content="Jane Smith"> 103 </head><body></body></html>` 104 doc := parseHTML(htmlStr) 105 106 author := extractor.ExtractAuthor(doc) 107 108 if author != "Jane Smith" { 109 t.Errorf("Expected meta tag author, got %q", author) 110 } 111 }) 112 113 t.Run("extracts from rel=author link", func(t *testing.T) { 114 htmlStr := `<html><body> 115 <a rel="author" href="/author/bob">Bob Johnson</a> 116 </body></html>` 117 doc := parseHTML(htmlStr) 118 119 author := extractor.ExtractAuthor(doc) 120 121 if author != "Bob Johnson" { 122 t.Errorf("Expected rel=author link text, got %q", author) 123 } 124 }) 125 126 t.Run("extracts from byline class", func(t *testing.T) { 127 htmlStr := `<html><body> 128 <span class="author-name">Alice Brown</span> 129 </body></html>` 130 doc := parseHTML(htmlStr) 131 132 author := extractor.ExtractAuthor(doc) 133 134 if author != "Alice Brown" { 135 t.Errorf("Expected byline class text, got %q", author) 136 } 137 }) 138 139 t.Run("returns empty for nil document", func(t *testing.T) { 140 author := extractor.ExtractAuthor(nil) 141 142 if author != "" { 143 t.Errorf("Expected empty string for nil document, got %q", author) 144 } 145 }) 146 }) 147 148 t.Run("ExtractPublishedDate", func(t *testing.T) { 149 extractor := NewMetadataExtractor() 150 151 t.Run("extracts from OpenGraph", func(t *testing.T) { 152 htmlStr := 
`<html><head> 153 <meta property="og:published_time" content="2025-01-15T10:00:00Z"> 154 </head><body></body></html>` 155 doc := parseHTML(htmlStr) 156 157 date := extractor.ExtractPublishedDate(doc) 158 159 if date != "2025-01-15T10:00:00Z" { 160 t.Errorf("Expected OpenGraph date, got %q", date) 161 } 162 }) 163 164 t.Run("extracts from article:published_time", func(t *testing.T) { 165 htmlStr := `<html><head> 166 <meta property="article:published_time" content="2025-02-20"> 167 </head><body></body></html>` 168 doc := parseHTML(htmlStr) 169 170 date := extractor.ExtractPublishedDate(doc) 171 172 if date != "2025-02-20" { 173 t.Errorf("Expected article:published_time, got %q", date) 174 } 175 }) 176 177 t.Run("extracts from time element", func(t *testing.T) { 178 htmlStr := `<html><body> 179 <time datetime="2025-03-25T14:30:00">March 25, 2025</time> 180 </body></html>` 181 doc := parseHTML(htmlStr) 182 183 date := extractor.ExtractPublishedDate(doc) 184 185 if date != "2025-03-25T14:30:00" { 186 t.Errorf("Expected time element datetime, got %q", date) 187 } 188 }) 189 190 t.Run("returns empty for nil document", func(t *testing.T) { 191 date := extractor.ExtractPublishedDate(nil) 192 193 if date != "" { 194 t.Errorf("Expected empty string for nil document, got %q", date) 195 } 196 }) 197 }) 198 199 t.Run("ExtractSiteName", func(t *testing.T) { 200 extractor := NewMetadataExtractor() 201 202 t.Run("extracts from OpenGraph", func(t *testing.T) { 203 htmlStr := `<html><head> 204 <meta property="og:site_name" content="Example News"> 205 </head><body></body></html>` 206 doc := parseHTML(htmlStr) 207 208 siteName := extractor.ExtractSiteName(doc) 209 210 if siteName != "Example News" { 211 t.Errorf("Expected OpenGraph site_name, got %q", siteName) 212 } 213 }) 214 215 t.Run("extracts from application-name", func(t *testing.T) { 216 htmlStr := `<html><head> 217 <meta name="application-name" content="Tech Blog"> 218 </head><body></body></html>` 219 doc := parseHTML(htmlStr) 
220 221 siteName := extractor.ExtractSiteName(doc) 222 223 if siteName != "Tech Blog" { 224 t.Errorf("Expected application-name, got %q", siteName) 225 } 226 }) 227 228 t.Run("returns empty for nil document", func(t *testing.T) { 229 siteName := extractor.ExtractSiteName(nil) 230 231 if siteName != "" { 232 t.Errorf("Expected empty string for nil document, got %q", siteName) 233 } 234 }) 235 }) 236 237 t.Run("ExtractLanguage", func(t *testing.T) { 238 extractor := NewMetadataExtractor() 239 240 t.Run("extracts from html lang attribute", func(t *testing.T) { 241 htmlStr := `<html lang="en-US"><body></body></html>` 242 doc := parseHTML(htmlStr) 243 244 lang := extractor.ExtractLanguage(doc) 245 246 if lang != "en-US" { 247 t.Errorf("Expected html lang attribute, got %q", lang) 248 } 249 }) 250 251 t.Run("extracts from OpenGraph locale", func(t *testing.T) { 252 htmlStr := `<html><head> 253 <meta property="og:locale" content="fr-FR"> 254 </head><body></body></html>` 255 doc := parseHTML(htmlStr) 256 257 lang := extractor.ExtractLanguage(doc) 258 259 if lang != "fr-FR" { 260 t.Errorf("Expected OpenGraph locale, got %q", lang) 261 } 262 }) 263 264 t.Run("returns empty for nil document", func(t *testing.T) { 265 lang := extractor.ExtractLanguage(nil) 266 267 if lang != "" { 268 t.Errorf("Expected empty string for nil document, got %q", lang) 269 } 270 }) 271 }) 272 273 t.Run("getSchemaOrgField", func(t *testing.T) { 274 extractor := NewMetadataExtractor() 275 276 t.Run("extracts from JSON-LD Article", func(t *testing.T) { 277 htmlStr := `<html><head> 278 <script type="application/ld+json"> 279 { 280 "@context": "https://schema.org", 281 "@type": "Article", 282 "headline": "Test Article", 283 "author": "Test Author", 284 "datePublished": "2025-01-15" 285 } 286 </script> 287 </head><body></body></html>` 288 doc := parseHTML(htmlStr) 289 290 headline := extractor.getSchemaOrgField(doc, "headline") 291 author := extractor.getSchemaOrgField(doc, "author") 292 date := 
extractor.getSchemaOrgField(doc, "datePublished") 293 294 if headline != "Test Article" { 295 t.Errorf("Expected headline from JSON-LD, got %q", headline) 296 } 297 298 if author != "Test Author" { 299 t.Errorf("Expected author from JSON-LD, got %q", author) 300 } 301 302 if date != "2025-01-15" { 303 t.Errorf("Expected datePublished from JSON-LD, got %q", date) 304 } 305 }) 306 307 t.Run("extracts from NewsArticle type", func(t *testing.T) { 308 htmlStr := `<html><head> 309 <script type="application/ld+json"> 310 { 311 "@context": "https://schema.org", 312 "@type": "NewsArticle", 313 "headline": "Breaking News" 314 } 315 </script> 316 </head><body></body></html>` 317 doc := parseHTML(htmlStr) 318 319 headline := extractor.getSchemaOrgField(doc, "headline") 320 321 if headline != "Breaking News" { 322 t.Errorf("Expected headline from NewsArticle, got %q", headline) 323 } 324 }) 325 326 t.Run("handles nested author object", func(t *testing.T) { 327 htmlStr := `<html><head> 328 <script type="application/ld+json"> 329 { 330 "@context": "https://schema.org", 331 "@type": "Article", 332 "author": { 333 "@type": "Person", 334 "name": "Nested Author" 335 } 336 } 337 </script> 338 </head><body></body></html>` 339 doc := parseHTML(htmlStr) 340 341 author := extractor.getSchemaOrgField(doc, "author") 342 343 if author != "Nested Author" { 344 t.Errorf("Expected nested author name, got %q", author) 345 } 346 }) 347 348 t.Run("returns empty for invalid JSON", func(t *testing.T) { 349 htmlStr := `<html><head> 350 <script type="application/ld+json"> 351 { invalid json } 352 </script> 353 </head><body></body></html>` 354 doc := parseHTML(htmlStr) 355 356 result := extractor.getSchemaOrgField(doc, "headline") 357 358 if result != "" { 359 t.Errorf("Expected empty for invalid JSON, got %q", result) 360 } 361 }) 362 363 t.Run("returns empty for non-Article types", func(t *testing.T) { 364 htmlStr := `<html><head> 365 <script type="application/ld+json"> 366 { 367 "@context": 
"https://schema.org", 368 "@type": "WebPage", 369 "headline": "Not an article" 370 } 371 </script> 372 </head><body></body></html>` 373 doc := parseHTML(htmlStr) 374 375 result := extractor.getSchemaOrgField(doc, "headline") 376 377 if result != "" { 378 t.Errorf("Expected empty for WebPage type, got %q", result) 379 } 380 }) 381 }) 382 383 t.Run("ExtractMetadata", func(t *testing.T) { 384 extractor := NewMetadataExtractor() 385 386 t.Run("extracts all metadata fields", func(t *testing.T) { 387 htmlStr := `<html lang="en"><head> 388 <title>Full Article Title</title> 389 <meta property="og:author" content="Full Name"> 390 <meta property="article:published_time" content="2025-01-20"> 391 <meta property="og:site_name" content="News Site"> 392 </head><body></body></html>` 393 doc := parseHTML(htmlStr) 394 395 result := extractor.ExtractMetadata(doc) 396 397 if result == nil { 398 t.Fatal("Expected result, got nil") 399 } 400 401 if !strings.Contains(result.Title, "Full Article Title") { 402 t.Errorf("Expected title to be extracted, got %q", result.Title) 403 } 404 405 if result.Author != "Full Name" { 406 t.Errorf("Expected author to be extracted, got %q", result.Author) 407 } 408 409 if result.PublishedDate != "2025-01-20" { 410 t.Errorf("Expected date to be extracted, got %q", result.PublishedDate) 411 } 412 413 if result.SiteName != "News Site" { 414 t.Errorf("Expected site name to be extracted, got %q", result.SiteName) 415 } 416 417 if result.Language != "en" { 418 t.Errorf("Expected language to be extracted, got %q", result.Language) 419 } 420 }) 421 422 t.Run("returns empty result for nil document", func(t *testing.T) { 423 result := extractor.ExtractMetadata(nil) 424 425 if result == nil { 426 t.Error("Expected empty result, got nil") 427 } 428 }) 429 }) 430}