cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "strings"
5 "testing"
6)
7
8func TestMetadataExtractor(t *testing.T) {
9 t.Run("NewMetadataExtractor", func(t *testing.T) {
10 t.Run("creates extractor", func(t *testing.T) {
11 extractor := NewMetadataExtractor()
12
13 if extractor == nil {
14 t.Fatal("Expected extractor to be created, got nil")
15 }
16 })
17 })
18
19 t.Run("ExtractTitle", func(t *testing.T) {
20 extractor := NewMetadataExtractor()
21
22 t.Run("extracts from OpenGraph", func(t *testing.T) {
23 htmlStr := `<html><head>
24 <meta property="og:title" content="Article Title from OpenGraph">
25 </head><body></body></html>`
26 doc := parseHTML(htmlStr)
27
28 title := extractor.ExtractTitle(doc)
29
30 if title != "Article Title from OpenGraph" {
31 t.Errorf("Expected OpenGraph title, got %q", title)
32 }
33 })
34
35 t.Run("extracts from title tag", func(t *testing.T) {
36 htmlStr := `<html><head>
37 <title>Page Title from Title Tag</title>
38 </head><body></body></html>`
39 doc := parseHTML(htmlStr)
40
41 title := extractor.ExtractTitle(doc)
42
43 if title != "Page Title from Title Tag" {
44 t.Errorf("Expected title tag content, got %q", title)
45 }
46 })
47
48 t.Run("extracts from h1", func(t *testing.T) {
49 htmlStr := `<html><body>
50 <h1>Heading Title</h1>
51 </body></html>`
52 doc := parseHTML(htmlStr)
53
54 title := extractor.ExtractTitle(doc)
55
56 if title != "Heading Title" {
57 t.Errorf("Expected h1 content, got %q", title)
58 }
59 })
60
61 t.Run("returns empty for nil document", func(t *testing.T) {
62 title := extractor.ExtractTitle(nil)
63
64 if title != "" {
65 t.Errorf("Expected empty string for nil document, got %q", title)
66 }
67 })
68
69 t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) {
70 htmlStr := `<html><head>
71 <meta property="og:title" content="OpenGraph Title">
72 <title>HTML Title</title>
73 </head><body></body></html>`
74 doc := parseHTML(htmlStr)
75
76 title := extractor.ExtractTitle(doc)
77
78 if title != "OpenGraph Title" {
79 t.Errorf("Expected OpenGraph title to have priority, got %q", title)
80 }
81 })
82 })
83
84 t.Run("ExtractAuthor", func(t *testing.T) {
85 extractor := NewMetadataExtractor()
86
87 t.Run("extracts from OpenGraph", func(t *testing.T) {
88 htmlStr := `<html><head>
89 <meta property="og:author" content="John Doe">
90 </head><body></body></html>`
91 doc := parseHTML(htmlStr)
92
93 author := extractor.ExtractAuthor(doc)
94
95 if author != "John Doe" {
96 t.Errorf("Expected OpenGraph author, got %q", author)
97 }
98 })
99
100 t.Run("extracts from meta tag", func(t *testing.T) {
101 htmlStr := `<html><head>
102 <meta name="author" content="Jane Smith">
103 </head><body></body></html>`
104 doc := parseHTML(htmlStr)
105
106 author := extractor.ExtractAuthor(doc)
107
108 if author != "Jane Smith" {
109 t.Errorf("Expected meta tag author, got %q", author)
110 }
111 })
112
113 t.Run("extracts from rel=author link", func(t *testing.T) {
114 htmlStr := `<html><body>
115 <a rel="author" href="/author/bob">Bob Johnson</a>
116 </body></html>`
117 doc := parseHTML(htmlStr)
118
119 author := extractor.ExtractAuthor(doc)
120
121 if author != "Bob Johnson" {
122 t.Errorf("Expected rel=author link text, got %q", author)
123 }
124 })
125
126 t.Run("extracts from byline class", func(t *testing.T) {
127 htmlStr := `<html><body>
128 <span class="author-name">Alice Brown</span>
129 </body></html>`
130 doc := parseHTML(htmlStr)
131
132 author := extractor.ExtractAuthor(doc)
133
134 if author != "Alice Brown" {
135 t.Errorf("Expected byline class text, got %q", author)
136 }
137 })
138
139 t.Run("returns empty for nil document", func(t *testing.T) {
140 author := extractor.ExtractAuthor(nil)
141
142 if author != "" {
143 t.Errorf("Expected empty string for nil document, got %q", author)
144 }
145 })
146 })
147
148 t.Run("ExtractPublishedDate", func(t *testing.T) {
149 extractor := NewMetadataExtractor()
150
151 t.Run("extracts from OpenGraph", func(t *testing.T) {
152 htmlStr := `<html><head>
153 <meta property="og:published_time" content="2025-01-15T10:00:00Z">
154 </head><body></body></html>`
155 doc := parseHTML(htmlStr)
156
157 date := extractor.ExtractPublishedDate(doc)
158
159 if date != "2025-01-15T10:00:00Z" {
160 t.Errorf("Expected OpenGraph date, got %q", date)
161 }
162 })
163
164 t.Run("extracts from article:published_time", func(t *testing.T) {
165 htmlStr := `<html><head>
166 <meta property="article:published_time" content="2025-02-20">
167 </head><body></body></html>`
168 doc := parseHTML(htmlStr)
169
170 date := extractor.ExtractPublishedDate(doc)
171
172 if date != "2025-02-20" {
173 t.Errorf("Expected article:published_time, got %q", date)
174 }
175 })
176
177 t.Run("extracts from time element", func(t *testing.T) {
178 htmlStr := `<html><body>
179 <time datetime="2025-03-25T14:30:00">March 25, 2025</time>
180 </body></html>`
181 doc := parseHTML(htmlStr)
182
183 date := extractor.ExtractPublishedDate(doc)
184
185 if date != "2025-03-25T14:30:00" {
186 t.Errorf("Expected time element datetime, got %q", date)
187 }
188 })
189
190 t.Run("returns empty for nil document", func(t *testing.T) {
191 date := extractor.ExtractPublishedDate(nil)
192
193 if date != "" {
194 t.Errorf("Expected empty string for nil document, got %q", date)
195 }
196 })
197 })
198
199 t.Run("ExtractSiteName", func(t *testing.T) {
200 extractor := NewMetadataExtractor()
201
202 t.Run("extracts from OpenGraph", func(t *testing.T) {
203 htmlStr := `<html><head>
204 <meta property="og:site_name" content="Example News">
205 </head><body></body></html>`
206 doc := parseHTML(htmlStr)
207
208 siteName := extractor.ExtractSiteName(doc)
209
210 if siteName != "Example News" {
211 t.Errorf("Expected OpenGraph site_name, got %q", siteName)
212 }
213 })
214
215 t.Run("extracts from application-name", func(t *testing.T) {
216 htmlStr := `<html><head>
217 <meta name="application-name" content="Tech Blog">
218 </head><body></body></html>`
219 doc := parseHTML(htmlStr)
220
221 siteName := extractor.ExtractSiteName(doc)
222
223 if siteName != "Tech Blog" {
224 t.Errorf("Expected application-name, got %q", siteName)
225 }
226 })
227
228 t.Run("returns empty for nil document", func(t *testing.T) {
229 siteName := extractor.ExtractSiteName(nil)
230
231 if siteName != "" {
232 t.Errorf("Expected empty string for nil document, got %q", siteName)
233 }
234 })
235 })
236
237 t.Run("ExtractLanguage", func(t *testing.T) {
238 extractor := NewMetadataExtractor()
239
240 t.Run("extracts from html lang attribute", func(t *testing.T) {
241 htmlStr := `<html lang="en-US"><body></body></html>`
242 doc := parseHTML(htmlStr)
243
244 lang := extractor.ExtractLanguage(doc)
245
246 if lang != "en-US" {
247 t.Errorf("Expected html lang attribute, got %q", lang)
248 }
249 })
250
251 t.Run("extracts from OpenGraph locale", func(t *testing.T) {
252 htmlStr := `<html><head>
253 <meta property="og:locale" content="fr-FR">
254 </head><body></body></html>`
255 doc := parseHTML(htmlStr)
256
257 lang := extractor.ExtractLanguage(doc)
258
259 if lang != "fr-FR" {
260 t.Errorf("Expected OpenGraph locale, got %q", lang)
261 }
262 })
263
264 t.Run("returns empty for nil document", func(t *testing.T) {
265 lang := extractor.ExtractLanguage(nil)
266
267 if lang != "" {
268 t.Errorf("Expected empty string for nil document, got %q", lang)
269 }
270 })
271 })
272
273 t.Run("getSchemaOrgField", func(t *testing.T) {
274 extractor := NewMetadataExtractor()
275
276 t.Run("extracts from JSON-LD Article", func(t *testing.T) {
277 htmlStr := `<html><head>
278 <script type="application/ld+json">
279 {
280 "@context": "https://schema.org",
281 "@type": "Article",
282 "headline": "Test Article",
283 "author": "Test Author",
284 "datePublished": "2025-01-15"
285 }
286 </script>
287 </head><body></body></html>`
288 doc := parseHTML(htmlStr)
289
290 headline := extractor.getSchemaOrgField(doc, "headline")
291 author := extractor.getSchemaOrgField(doc, "author")
292 date := extractor.getSchemaOrgField(doc, "datePublished")
293
294 if headline != "Test Article" {
295 t.Errorf("Expected headline from JSON-LD, got %q", headline)
296 }
297
298 if author != "Test Author" {
299 t.Errorf("Expected author from JSON-LD, got %q", author)
300 }
301
302 if date != "2025-01-15" {
303 t.Errorf("Expected datePublished from JSON-LD, got %q", date)
304 }
305 })
306
307 t.Run("extracts from NewsArticle type", func(t *testing.T) {
308 htmlStr := `<html><head>
309 <script type="application/ld+json">
310 {
311 "@context": "https://schema.org",
312 "@type": "NewsArticle",
313 "headline": "Breaking News"
314 }
315 </script>
316 </head><body></body></html>`
317 doc := parseHTML(htmlStr)
318
319 headline := extractor.getSchemaOrgField(doc, "headline")
320
321 if headline != "Breaking News" {
322 t.Errorf("Expected headline from NewsArticle, got %q", headline)
323 }
324 })
325
326 t.Run("handles nested author object", func(t *testing.T) {
327 htmlStr := `<html><head>
328 <script type="application/ld+json">
329 {
330 "@context": "https://schema.org",
331 "@type": "Article",
332 "author": {
333 "@type": "Person",
334 "name": "Nested Author"
335 }
336 }
337 </script>
338 </head><body></body></html>`
339 doc := parseHTML(htmlStr)
340
341 author := extractor.getSchemaOrgField(doc, "author")
342
343 if author != "Nested Author" {
344 t.Errorf("Expected nested author name, got %q", author)
345 }
346 })
347
348 t.Run("returns empty for invalid JSON", func(t *testing.T) {
349 htmlStr := `<html><head>
350 <script type="application/ld+json">
351 { invalid json }
352 </script>
353 </head><body></body></html>`
354 doc := parseHTML(htmlStr)
355
356 result := extractor.getSchemaOrgField(doc, "headline")
357
358 if result != "" {
359 t.Errorf("Expected empty for invalid JSON, got %q", result)
360 }
361 })
362
363 t.Run("returns empty for non-Article types", func(t *testing.T) {
364 htmlStr := `<html><head>
365 <script type="application/ld+json">
366 {
367 "@context": "https://schema.org",
368 "@type": "WebPage",
369 "headline": "Not an article"
370 }
371 </script>
372 </head><body></body></html>`
373 doc := parseHTML(htmlStr)
374
375 result := extractor.getSchemaOrgField(doc, "headline")
376
377 if result != "" {
378 t.Errorf("Expected empty for WebPage type, got %q", result)
379 }
380 })
381 })
382
383 t.Run("ExtractMetadata", func(t *testing.T) {
384 extractor := NewMetadataExtractor()
385
386 t.Run("extracts all metadata fields", func(t *testing.T) {
387 htmlStr := `<html lang="en"><head>
388 <title>Full Article Title</title>
389 <meta property="og:author" content="Full Name">
390 <meta property="article:published_time" content="2025-01-20">
391 <meta property="og:site_name" content="News Site">
392 </head><body></body></html>`
393 doc := parseHTML(htmlStr)
394
395 result := extractor.ExtractMetadata(doc)
396
397 if result == nil {
398 t.Fatal("Expected result, got nil")
399 }
400
401 if !strings.Contains(result.Title, "Full Article Title") {
402 t.Errorf("Expected title to be extracted, got %q", result.Title)
403 }
404
405 if result.Author != "Full Name" {
406 t.Errorf("Expected author to be extracted, got %q", result.Author)
407 }
408
409 if result.PublishedDate != "2025-01-20" {
410 t.Errorf("Expected date to be extracted, got %q", result.PublishedDate)
411 }
412
413 if result.SiteName != "News Site" {
414 t.Errorf("Expected site name to be extracted, got %q", result.SiteName)
415 }
416
417 if result.Language != "en" {
418 t.Errorf("Expected language to be extracted, got %q", result.Language)
419 }
420 })
421
422 t.Run("returns empty result for nil document", func(t *testing.T) {
423 result := extractor.ExtractMetadata(nil)
424
425 if result == nil {
426 t.Error("Expected empty result, got nil")
427 }
428 })
429 })
430}