internal/articles/metadata.go at main · desertthunder.dev/noteleaf

cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
noteleaf / internal / articles / metadata.go
at main 305 lines 7.5 kB view raw
  1package articles
  2
  3import (
  4	"encoding/json"
  5	"strings"
  6
  7	"github.com/antchfx/htmlquery"
  8	"golang.org/x/net/html"
  9)
 10
// MetadataExtractor pulls article metadata (title, author, published date,
// site name, language) out of a parsed HTML document.
//
// Each field is resolved through its own fallback chain over OpenGraph and
// other meta tags, Schema.org JSON-LD, and plain HTML elements. The type is an
// empty struct: it carries no configuration or per-document state.
type MetadataExtractor struct{}
 15
 16// NewMetadataExtractor creates a new metadata extractor.
 17func NewMetadataExtractor() *MetadataExtractor {
 18	return &MetadataExtractor{}
 19}
 20
 21// ExtractMetadata extracts all available metadata from an HTML document.
 22// Returns an ExtractionResult with populated metadata fields.
 23func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult {
 24	if doc == nil {
 25		return &ExtractionResult{}
 26	}
 27
 28	result := &ExtractionResult{}
 29
 30	result.Title = m.ExtractTitle(doc)
 31	result.Author = m.ExtractAuthor(doc)
 32	result.PublishedDate = m.ExtractPublishedDate(doc)
 33	result.SiteName = m.ExtractSiteName(doc)
 34	result.Language = m.ExtractLanguage(doc)
 35
 36	return result
 37}
 38
 39// ExtractTitle extracts the article title using multiple strategies.
 40// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag.
 41func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string {
 42	if doc == nil {
 43		return ""
 44	}
 45
 46	if title := m.getMetaContent(doc, "property", "og:title"); title != "" {
 47		return title
 48	}
 49
 50	if title := m.getSchemaOrgField(doc, "headline"); title != "" {
 51		return title
 52	}
 53
 54	if title := m.getSchemaOrgField(doc, "name"); title != "" {
 55		return title
 56	}
 57
 58	if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" {
 59		return title
 60	}
 61
 62	if title := m.getMetaContent(doc, "property", "article:title"); title != "" {
 63		return title
 64	}
 65
 66	if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil {
 67		if title := htmlquery.InnerText(h1); title != "" {
 68			return strings.TrimSpace(title)
 69		}
 70	}
 71
 72	if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil {
 73		if title := htmlquery.InnerText(titleNode); title != "" {
 74			return strings.TrimSpace(title)
 75		}
 76	}
 77
 78	return ""
 79}
 80
 81// ExtractAuthor extracts the article author using multiple strategies.
 82// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements.
 83func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string {
 84	if doc == nil {
 85		return ""
 86	}
 87
 88	if author := m.getMetaContent(doc, "property", "og:author"); author != "" {
 89		return author
 90	}
 91
 92	if author := m.getSchemaOrgField(doc, "author"); author != "" {
 93		return author
 94	}
 95
 96	if author := m.getMetaContent(doc, "property", "article:author"); author != "" {
 97		return author
 98	}
 99
100	if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" {
101		return author
102	}
103
104	if author := m.getMetaContent(doc, "name", "author"); author != "" {
105		return author
106	}
107
108	if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil {
109		if author := htmlquery.InnerText(authorLink); author != "" {
110			return strings.TrimSpace(author)
111		}
112	}
113
114	bylineSelectors := []string{
115		"//span[contains(@class, 'author')]",
116		"//div[contains(@class, 'author')]",
117		"//p[contains(@class, 'byline')]",
118		"//span[contains(@class, 'byline')]",
119	}
120
121	for _, selector := range bylineSelectors {
122		if node := htmlquery.FindOne(doc, selector); node != nil {
123			if author := htmlquery.InnerText(node); author != "" {
124				return strings.TrimSpace(author)
125			}
126		}
127	}
128
129	return ""
130}
131
132// ExtractPublishedDate extracts the publication date using multiple strategies.
133// Tries in order: OpenGraph, Schema.org, article:published_time, time elements.
134func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string {
135	if doc == nil {
136		return ""
137	}
138
139	if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" {
140		return date
141	}
142
143	if date := m.getSchemaOrgField(doc, "datePublished"); date != "" {
144		return date
145	}
146
147	if date := m.getSchemaOrgField(doc, "publishDate"); date != "" {
148		return date
149	}
150
151	if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" {
152		return date
153	}
154
155	if date := m.getMetaContent(doc, "name", "publication_date"); date != "" {
156		return date
157	}
158
159	if date := m.getMetaContent(doc, "name", "date"); date != "" {
160		return date
161	}
162
163	if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil {
164		for _, attr := range timeNode.Attr {
165			if attr.Key == "datetime" {
166				return attr.Val
167			}
168		}
169	}
170
171	return ""
172}
173
174// ExtractSiteName extracts the site name using multiple strategies.
175// Tries in order: OpenGraph, Schema.org, meta tags.
176func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string {
177	if doc == nil {
178		return ""
179	}
180
181	if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" {
182		return siteName
183	}
184
185	if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" {
186		return publisher
187	}
188
189	if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" {
190		return siteName
191	}
192
193	return ""
194}
195
196// ExtractLanguage extracts the document language.
197// Tries in order: html lang attribute, OpenGraph, meta tags.
198func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string {
199	if doc == nil {
200		return ""
201	}
202
203	if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil {
204		for _, attr := range htmlNode.Attr {
205			if attr.Key == "lang" {
206				return attr.Val
207			}
208		}
209	}
210
211	if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" {
212		return locale
213	}
214
215	if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" {
216		return lang
217	}
218
219	return ""
220}
221
222// getMetaContent retrieves the content attribute from a meta tag.
223// Searches for meta tags with the specified attribute name and value.
224func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string {
225	if doc == nil {
226		return ""
227	}
228
229	xpath := "//meta[@" + attrName + "='" + attrValue + "']"
230	metaNode := htmlquery.FindOne(doc, xpath)
231
232	if metaNode == nil {
233		return ""
234	}
235
236	for _, attr := range metaNode.Attr {
237		if attr.Key == "content" {
238			return strings.TrimSpace(attr.Val)
239		}
240	}
241
242	return ""
243}
244
245// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data.
246func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string {
247	if doc == nil {
248		return ""
249	}
250
251	scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']")
252
253	for _, script := range scripts {
254		if script.FirstChild == nil || script.FirstChild.Type != html.TextNode {
255			continue
256		}
257
258		var data map[string]any
259		if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil {
260			continue
261		}
262
263		context, hasContext := data["@context"]
264		typeVal, hasType := data["@type"]
265
266		if !hasContext || !hasType {
267			continue
268		}
269
270		contextStr, ok := context.(string)
271		if !ok || !strings.Contains(contextStr, "schema.org") {
272			continue
273		}
274
275		typeStr, ok := typeVal.(string)
276		if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) {
277			continue
278		}
279
280		if value, exists := data[fieldName]; exists {
281			return m.extractStringValue(value)
282		}
283	}
284
285	return ""
286}
287
288// extractStringValue extracts a string from various JSON value types.
289func (m *MetadataExtractor) extractStringValue(value any) string {
290	switch v := value.(type) {
291	case string:
292		return v
293	case map[string]any:
294		if name, exists := v["name"]; exists {
295			if nameStr, ok := name.(string); ok {
296				return nameStr
297			}
298		}
299	case []any:
300		if len(v) > 0 {
301			return m.extractStringValue(v[0])
302		}
303	}
304	return ""
305}