internal/articles/metadata_test.go at main · desertthunder.dev/noteleaf

cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
noteleaf / internal / articles / metadata_test.go
at main 430 lines 11 kB view raw
  1package articles
  2
  3import (
  4	"strings"
  5	"testing"
  6)
  7
  8func TestMetadataExtractor(t *testing.T) {
  9	t.Run("NewMetadataExtractor", func(t *testing.T) {
 10		t.Run("creates extractor", func(t *testing.T) {
 11			extractor := NewMetadataExtractor()
 12
 13			if extractor == nil {
 14				t.Fatal("Expected extractor to be created, got nil")
 15			}
 16		})
 17	})
 18
 19	t.Run("ExtractTitle", func(t *testing.T) {
 20		extractor := NewMetadataExtractor()
 21
 22		t.Run("extracts from OpenGraph", func(t *testing.T) {
 23			htmlStr := `<html><head>
 24				<meta property="og:title" content="Article Title from OpenGraph">
 25			</head><body></body></html>`
 26			doc := parseHTML(htmlStr)
 27
 28			title := extractor.ExtractTitle(doc)
 29
 30			if title != "Article Title from OpenGraph" {
 31				t.Errorf("Expected OpenGraph title, got %q", title)
 32			}
 33		})
 34
 35		t.Run("extracts from title tag", func(t *testing.T) {
 36			htmlStr := `<html><head>
 37				<title>Page Title from Title Tag</title>
 38			</head><body></body></html>`
 39			doc := parseHTML(htmlStr)
 40
 41			title := extractor.ExtractTitle(doc)
 42
 43			if title != "Page Title from Title Tag" {
 44				t.Errorf("Expected title tag content, got %q", title)
 45			}
 46		})
 47
 48		t.Run("extracts from h1", func(t *testing.T) {
 49			htmlStr := `<html><body>
 50				<h1>Heading Title</h1>
 51			</body></html>`
 52			doc := parseHTML(htmlStr)
 53
 54			title := extractor.ExtractTitle(doc)
 55
 56			if title != "Heading Title" {
 57				t.Errorf("Expected h1 content, got %q", title)
 58			}
 59		})
 60
 61		t.Run("returns empty for nil document", func(t *testing.T) {
 62			title := extractor.ExtractTitle(nil)
 63
 64			if title != "" {
 65				t.Errorf("Expected empty string for nil document, got %q", title)
 66			}
 67		})
 68
 69		t.Run("prioritizes OpenGraph over title tag", func(t *testing.T) {
 70			htmlStr := `<html><head>
 71				<meta property="og:title" content="OpenGraph Title">
 72				<title>HTML Title</title>
 73			</head><body></body></html>`
 74			doc := parseHTML(htmlStr)
 75
 76			title := extractor.ExtractTitle(doc)
 77
 78			if title != "OpenGraph Title" {
 79				t.Errorf("Expected OpenGraph title to have priority, got %q", title)
 80			}
 81		})
 82	})
 83
 84	t.Run("ExtractAuthor", func(t *testing.T) {
 85		extractor := NewMetadataExtractor()
 86
 87		t.Run("extracts from OpenGraph", func(t *testing.T) {
 88			htmlStr := `<html><head>
 89				<meta property="og:author" content="John Doe">
 90			</head><body></body></html>`
 91			doc := parseHTML(htmlStr)
 92
 93			author := extractor.ExtractAuthor(doc)
 94
 95			if author != "John Doe" {
 96				t.Errorf("Expected OpenGraph author, got %q", author)
 97			}
 98		})
 99
100		t.Run("extracts from meta tag", func(t *testing.T) {
101			htmlStr := `<html><head>
102				<meta name="author" content="Jane Smith">
103			</head><body></body></html>`
104			doc := parseHTML(htmlStr)
105
106			author := extractor.ExtractAuthor(doc)
107
108			if author != "Jane Smith" {
109				t.Errorf("Expected meta tag author, got %q", author)
110			}
111		})
112
113		t.Run("extracts from rel=author link", func(t *testing.T) {
114			htmlStr := `<html><body>
115				<a rel="author" href="/author/bob">Bob Johnson</a>
116			</body></html>`
117			doc := parseHTML(htmlStr)
118
119			author := extractor.ExtractAuthor(doc)
120
121			if author != "Bob Johnson" {
122				t.Errorf("Expected rel=author link text, got %q", author)
123			}
124		})
125
126		t.Run("extracts from byline class", func(t *testing.T) {
127			htmlStr := `<html><body>
128				<span class="author-name">Alice Brown</span>
129			</body></html>`
130			doc := parseHTML(htmlStr)
131
132			author := extractor.ExtractAuthor(doc)
133
134			if author != "Alice Brown" {
135				t.Errorf("Expected byline class text, got %q", author)
136			}
137		})
138
139		t.Run("returns empty for nil document", func(t *testing.T) {
140			author := extractor.ExtractAuthor(nil)
141
142			if author != "" {
143				t.Errorf("Expected empty string for nil document, got %q", author)
144			}
145		})
146	})
147
148	t.Run("ExtractPublishedDate", func(t *testing.T) {
149		extractor := NewMetadataExtractor()
150
151		t.Run("extracts from OpenGraph", func(t *testing.T) {
152			htmlStr := `<html><head>
153				<meta property="og:published_time" content="2025-01-15T10:00:00Z">
154			</head><body></body></html>`
155			doc := parseHTML(htmlStr)
156
157			date := extractor.ExtractPublishedDate(doc)
158
159			if date != "2025-01-15T10:00:00Z" {
160				t.Errorf("Expected OpenGraph date, got %q", date)
161			}
162		})
163
164		t.Run("extracts from article:published_time", func(t *testing.T) {
165			htmlStr := `<html><head>
166				<meta property="article:published_time" content="2025-02-20">
167			</head><body></body></html>`
168			doc := parseHTML(htmlStr)
169
170			date := extractor.ExtractPublishedDate(doc)
171
172			if date != "2025-02-20" {
173				t.Errorf("Expected article:published_time, got %q", date)
174			}
175		})
176
177		t.Run("extracts from time element", func(t *testing.T) {
178			htmlStr := `<html><body>
179				<time datetime="2025-03-25T14:30:00">March 25, 2025</time>
180			</body></html>`
181			doc := parseHTML(htmlStr)
182
183			date := extractor.ExtractPublishedDate(doc)
184
185			if date != "2025-03-25T14:30:00" {
186				t.Errorf("Expected time element datetime, got %q", date)
187			}
188		})
189
190		t.Run("returns empty for nil document", func(t *testing.T) {
191			date := extractor.ExtractPublishedDate(nil)
192
193			if date != "" {
194				t.Errorf("Expected empty string for nil document, got %q", date)
195			}
196		})
197	})
198
199	t.Run("ExtractSiteName", func(t *testing.T) {
200		extractor := NewMetadataExtractor()
201
202		t.Run("extracts from OpenGraph", func(t *testing.T) {
203			htmlStr := `<html><head>
204				<meta property="og:site_name" content="Example News">
205			</head><body></body></html>`
206			doc := parseHTML(htmlStr)
207
208			siteName := extractor.ExtractSiteName(doc)
209
210			if siteName != "Example News" {
211				t.Errorf("Expected OpenGraph site_name, got %q", siteName)
212			}
213		})
214
215		t.Run("extracts from application-name", func(t *testing.T) {
216			htmlStr := `<html><head>
217				<meta name="application-name" content="Tech Blog">
218			</head><body></body></html>`
219			doc := parseHTML(htmlStr)
220
221			siteName := extractor.ExtractSiteName(doc)
222
223			if siteName != "Tech Blog" {
224				t.Errorf("Expected application-name, got %q", siteName)
225			}
226		})
227
228		t.Run("returns empty for nil document", func(t *testing.T) {
229			siteName := extractor.ExtractSiteName(nil)
230
231			if siteName != "" {
232				t.Errorf("Expected empty string for nil document, got %q", siteName)
233			}
234		})
235	})
236
237	t.Run("ExtractLanguage", func(t *testing.T) {
238		extractor := NewMetadataExtractor()
239
240		t.Run("extracts from html lang attribute", func(t *testing.T) {
241			htmlStr := `<html lang="en-US"><body></body></html>`
242			doc := parseHTML(htmlStr)
243
244			lang := extractor.ExtractLanguage(doc)
245
246			if lang != "en-US" {
247				t.Errorf("Expected html lang attribute, got %q", lang)
248			}
249		})
250
251		t.Run("extracts from OpenGraph locale", func(t *testing.T) {
252			htmlStr := `<html><head>
253				<meta property="og:locale" content="fr-FR">
254			</head><body></body></html>`
255			doc := parseHTML(htmlStr)
256
257			lang := extractor.ExtractLanguage(doc)
258
259			if lang != "fr-FR" {
260				t.Errorf("Expected OpenGraph locale, got %q", lang)
261			}
262		})
263
264		t.Run("returns empty for nil document", func(t *testing.T) {
265			lang := extractor.ExtractLanguage(nil)
266
267			if lang != "" {
268				t.Errorf("Expected empty string for nil document, got %q", lang)
269			}
270		})
271	})
272
273	t.Run("getSchemaOrgField", func(t *testing.T) {
274		extractor := NewMetadataExtractor()
275
276		t.Run("extracts from JSON-LD Article", func(t *testing.T) {
277			htmlStr := `<html><head>
278				<script type="application/ld+json">
279				{
280					"@context": "https://schema.org",
281					"@type": "Article",
282					"headline": "Test Article",
283					"author": "Test Author",
284					"datePublished": "2025-01-15"
285				}
286				</script>
287			</head><body></body></html>`
288			doc := parseHTML(htmlStr)
289
290			headline := extractor.getSchemaOrgField(doc, "headline")
291			author := extractor.getSchemaOrgField(doc, "author")
292			date := extractor.getSchemaOrgField(doc, "datePublished")
293
294			if headline != "Test Article" {
295				t.Errorf("Expected headline from JSON-LD, got %q", headline)
296			}
297
298			if author != "Test Author" {
299				t.Errorf("Expected author from JSON-LD, got %q", author)
300			}
301
302			if date != "2025-01-15" {
303				t.Errorf("Expected datePublished from JSON-LD, got %q", date)
304			}
305		})
306
307		t.Run("extracts from NewsArticle type", func(t *testing.T) {
308			htmlStr := `<html><head>
309				<script type="application/ld+json">
310				{
311					"@context": "https://schema.org",
312					"@type": "NewsArticle",
313					"headline": "Breaking News"
314				}
315				</script>
316			</head><body></body></html>`
317			doc := parseHTML(htmlStr)
318
319			headline := extractor.getSchemaOrgField(doc, "headline")
320
321			if headline != "Breaking News" {
322				t.Errorf("Expected headline from NewsArticle, got %q", headline)
323			}
324		})
325
326		t.Run("handles nested author object", func(t *testing.T) {
327			htmlStr := `<html><head>
328				<script type="application/ld+json">
329				{
330					"@context": "https://schema.org",
331					"@type": "Article",
332					"author": {
333						"@type": "Person",
334						"name": "Nested Author"
335					}
336				}
337				</script>
338			</head><body></body></html>`
339			doc := parseHTML(htmlStr)
340
341			author := extractor.getSchemaOrgField(doc, "author")
342
343			if author != "Nested Author" {
344				t.Errorf("Expected nested author name, got %q", author)
345			}
346		})
347
348		t.Run("returns empty for invalid JSON", func(t *testing.T) {
349			htmlStr := `<html><head>
350				<script type="application/ld+json">
351				{ invalid json }
352				</script>
353			</head><body></body></html>`
354			doc := parseHTML(htmlStr)
355
356			result := extractor.getSchemaOrgField(doc, "headline")
357
358			if result != "" {
359				t.Errorf("Expected empty for invalid JSON, got %q", result)
360			}
361		})
362
363		t.Run("returns empty for non-Article types", func(t *testing.T) {
364			htmlStr := `<html><head>
365				<script type="application/ld+json">
366				{
367					"@context": "https://schema.org",
368					"@type": "WebPage",
369					"headline": "Not an article"
370				}
371				</script>
372			</head><body></body></html>`
373			doc := parseHTML(htmlStr)
374
375			result := extractor.getSchemaOrgField(doc, "headline")
376
377			if result != "" {
378				t.Errorf("Expected empty for WebPage type, got %q", result)
379			}
380		})
381	})
382
383	t.Run("ExtractMetadata", func(t *testing.T) {
384		extractor := NewMetadataExtractor()
385
386		t.Run("extracts all metadata fields", func(t *testing.T) {
387			htmlStr := `<html lang="en"><head>
388				<title>Full Article Title</title>
389				<meta property="og:author" content="Full Name">
390				<meta property="article:published_time" content="2025-01-20">
391				<meta property="og:site_name" content="News Site">
392			</head><body></body></html>`
393			doc := parseHTML(htmlStr)
394
395			result := extractor.ExtractMetadata(doc)
396
397			if result == nil {
398				t.Fatal("Expected result, got nil")
399			}
400
401			if !strings.Contains(result.Title, "Full Article Title") {
402				t.Errorf("Expected title to be extracted, got %q", result.Title)
403			}
404
405			if result.Author != "Full Name" {
406				t.Errorf("Expected author to be extracted, got %q", result.Author)
407			}
408
409			if result.PublishedDate != "2025-01-20" {
410				t.Errorf("Expected date to be extracted, got %q", result.PublishedDate)
411			}
412
413			if result.SiteName != "News Site" {
414				t.Errorf("Expected site name to be extracted, got %q", result.SiteName)
415			}
416
417			if result.Language != "en" {
418				t.Errorf("Expected language to be extracted, got %q", result.Language)
419			}
420		})
421
422		t.Run("returns empty result for nil document", func(t *testing.T) {
423			result := extractor.ExtractMetadata(nil)
424
425			if result == nil {
426				t.Error("Expected empty result, got nil")
427			}
428		})
429	})
430}