cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "encoding/json"
5 "strings"
6
7 "github.com/antchfx/htmlquery"
8 "golang.org/x/net/html"
9)
10
// MetadataExtractor pulls article metadata (title, author, publication date,
// site name and language) out of a parsed HTML document. Each field is resolved
// through its own ordered chain of sources, such as OpenGraph and other meta
// tags, Schema.org JSON-LD, and plain HTML elements.
//
// The type carries no state.
type MetadataExtractor struct{}

// NewMetadataExtractor returns a ready-to-use, non-nil *MetadataExtractor.
func NewMetadataExtractor() *MetadataExtractor {
	return new(MetadataExtractor)
}
20
21// ExtractMetadata extracts all available metadata from an HTML document.
22// Returns an ExtractionResult with populated metadata fields.
23func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult {
24 if doc == nil {
25 return &ExtractionResult{}
26 }
27
28 result := &ExtractionResult{}
29
30 result.Title = m.ExtractTitle(doc)
31 result.Author = m.ExtractAuthor(doc)
32 result.PublishedDate = m.ExtractPublishedDate(doc)
33 result.SiteName = m.ExtractSiteName(doc)
34 result.Language = m.ExtractLanguage(doc)
35
36 return result
37}
38
39// ExtractTitle extracts the article title using multiple strategies.
40// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag.
41func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string {
42 if doc == nil {
43 return ""
44 }
45
46 if title := m.getMetaContent(doc, "property", "og:title"); title != "" {
47 return title
48 }
49
50 if title := m.getSchemaOrgField(doc, "headline"); title != "" {
51 return title
52 }
53
54 if title := m.getSchemaOrgField(doc, "name"); title != "" {
55 return title
56 }
57
58 if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" {
59 return title
60 }
61
62 if title := m.getMetaContent(doc, "property", "article:title"); title != "" {
63 return title
64 }
65
66 if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil {
67 if title := htmlquery.InnerText(h1); title != "" {
68 return strings.TrimSpace(title)
69 }
70 }
71
72 if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil {
73 if title := htmlquery.InnerText(titleNode); title != "" {
74 return strings.TrimSpace(title)
75 }
76 }
77
78 return ""
79}
80
81// ExtractAuthor extracts the article author using multiple strategies.
82// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements.
83func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string {
84 if doc == nil {
85 return ""
86 }
87
88 if author := m.getMetaContent(doc, "property", "og:author"); author != "" {
89 return author
90 }
91
92 if author := m.getSchemaOrgField(doc, "author"); author != "" {
93 return author
94 }
95
96 if author := m.getMetaContent(doc, "property", "article:author"); author != "" {
97 return author
98 }
99
100 if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" {
101 return author
102 }
103
104 if author := m.getMetaContent(doc, "name", "author"); author != "" {
105 return author
106 }
107
108 if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil {
109 if author := htmlquery.InnerText(authorLink); author != "" {
110 return strings.TrimSpace(author)
111 }
112 }
113
114 bylineSelectors := []string{
115 "//span[contains(@class, 'author')]",
116 "//div[contains(@class, 'author')]",
117 "//p[contains(@class, 'byline')]",
118 "//span[contains(@class, 'byline')]",
119 }
120
121 for _, selector := range bylineSelectors {
122 if node := htmlquery.FindOne(doc, selector); node != nil {
123 if author := htmlquery.InnerText(node); author != "" {
124 return strings.TrimSpace(author)
125 }
126 }
127 }
128
129 return ""
130}
131
132// ExtractPublishedDate extracts the publication date using multiple strategies.
133// Tries in order: OpenGraph, Schema.org, article:published_time, time elements.
134func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string {
135 if doc == nil {
136 return ""
137 }
138
139 if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" {
140 return date
141 }
142
143 if date := m.getSchemaOrgField(doc, "datePublished"); date != "" {
144 return date
145 }
146
147 if date := m.getSchemaOrgField(doc, "publishDate"); date != "" {
148 return date
149 }
150
151 if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" {
152 return date
153 }
154
155 if date := m.getMetaContent(doc, "name", "publication_date"); date != "" {
156 return date
157 }
158
159 if date := m.getMetaContent(doc, "name", "date"); date != "" {
160 return date
161 }
162
163 if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil {
164 for _, attr := range timeNode.Attr {
165 if attr.Key == "datetime" {
166 return attr.Val
167 }
168 }
169 }
170
171 return ""
172}
173
174// ExtractSiteName extracts the site name using multiple strategies.
175// Tries in order: OpenGraph, Schema.org, meta tags.
176func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string {
177 if doc == nil {
178 return ""
179 }
180
181 if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" {
182 return siteName
183 }
184
185 if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" {
186 return publisher
187 }
188
189 if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" {
190 return siteName
191 }
192
193 return ""
194}
195
196// ExtractLanguage extracts the document language.
197// Tries in order: html lang attribute, OpenGraph, meta tags.
198func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string {
199 if doc == nil {
200 return ""
201 }
202
203 if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil {
204 for _, attr := range htmlNode.Attr {
205 if attr.Key == "lang" {
206 return attr.Val
207 }
208 }
209 }
210
211 if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" {
212 return locale
213 }
214
215 if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" {
216 return lang
217 }
218
219 return ""
220}
221
222// getMetaContent retrieves the content attribute from a meta tag.
223// Searches for meta tags with the specified attribute name and value.
224func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string {
225 if doc == nil {
226 return ""
227 }
228
229 xpath := "//meta[@" + attrName + "='" + attrValue + "']"
230 metaNode := htmlquery.FindOne(doc, xpath)
231
232 if metaNode == nil {
233 return ""
234 }
235
236 for _, attr := range metaNode.Attr {
237 if attr.Key == "content" {
238 return strings.TrimSpace(attr.Val)
239 }
240 }
241
242 return ""
243}
244
245// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data.
246func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string {
247 if doc == nil {
248 return ""
249 }
250
251 scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']")
252
253 for _, script := range scripts {
254 if script.FirstChild == nil || script.FirstChild.Type != html.TextNode {
255 continue
256 }
257
258 var data map[string]any
259 if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil {
260 continue
261 }
262
263 context, hasContext := data["@context"]
264 typeVal, hasType := data["@type"]
265
266 if !hasContext || !hasType {
267 continue
268 }
269
270 contextStr, ok := context.(string)
271 if !ok || !strings.Contains(contextStr, "schema.org") {
272 continue
273 }
274
275 typeStr, ok := typeVal.(string)
276 if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) {
277 continue
278 }
279
280 if value, exists := data[fieldName]; exists {
281 return m.extractStringValue(value)
282 }
283 }
284
285 return ""
286}
287
288// extractStringValue extracts a string from various JSON value types.
289func (m *MetadataExtractor) extractStringValue(value any) string {
290 switch v := value.(type) {
291 case string:
292 return v
293 case map[string]any:
294 if name, exists := v["name"]; exists {
295 if nameStr, ok := name.(string); ok {
296 return nameStr
297 }
298 }
299 case []any:
300 if len(v) > 0 {
301 return m.extractStringValue(v[0])
302 }
303 }
304 return ""
305}